install.packages("e1071",repos = "http://cran.us.r-project.org")
##
## There is a binary version available but the source version is
## later:
## binary source needs_compilation
## e1071 1.7-1 1.7-2 TRUE
##
## Binaries will be installed
## package 'e1071' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\Rtmpyoo2hC\downloaded_packages
install.packages("ggplot2",repos = "http://cran.us.r-project.org")
##
## There is a binary version available but the source version is
## later:
## binary source needs_compilation
## ggplot2 3.1.1 3.2.0 FALSE
## installing the source package 'ggplot2'
install.packages("corrplot",repos = "http://cran.us.r-project.org")
## package 'corrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\Rtmpyoo2hC\downloaded_packages
install.packages("ggcorrplot",repos = "http://cran.us.r-project.org")
##
## There is a binary version available but the source version is
## later:
## binary source needs_compilation
## ggcorrplot 0.1.2 0.1.3 FALSE
## installing the source package 'ggcorrplot'
install.packages("klaR",repos = "http://cran.us.r-project.org")
## package 'klaR' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\Rtmpyoo2hC\downloaded_packages
install.packages("cluster",repos = "http://cran.us.r-project.org")
##
## There is a binary version available but the source version is
## later:
## binary source needs_compilation
## cluster 2.0.8 2.1.0 TRUE
##
## Binaries will be installed
## package 'cluster' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\Rtmpyoo2hC\downloaded_packages
install.packages("fpc",repos = "http://cran.us.r-project.org")
##
## There is a binary version available but the source version is
## later:
## binary source needs_compilation
## fpc 2.1-11.1 2.2-3 FALSE
## installing the source package 'fpc'
install.packages("class",repos = "http://cran.us.r-project.org")
## package 'class' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\Rtmpyoo2hC\downloaded_packages
install.packages("rpart",repos = "http://cran.us.r-project.org")
## package 'rpart' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Abdullah\AppData\Local\Temp\Rtmpyoo2hC\downloaded_packages
library("rpart")
## Warning: package 'rpart' was built under R version 3.4.4
library("class")
## Warning: package 'class' was built under R version 3.4.4
library("fpc")
library("cluster")
## Warning: package 'cluster' was built under R version 3.4.4
library("plyr")
## Warning: package 'plyr' was built under R version 3.4.4
library("klaR")
## Warning: package 'klaR' was built under R version 3.4.4
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.4.4
library("ggplot2")
library("e1071")
## Warning: package 'e1071' was built under R version 3.4.4
library("corrplot")
## Warning: package 'corrplot' was built under R version 3.4.4
## corrplot 0.84 loaded
library("ggcorrplot")
In this part we mostly clean our data, which includes: addressing missing/duplicate values, looking for outliers, correcting data types, fixing categorical variables, examining the distribution of variables, and applying a low-variance filter.
# Load the training data. The file is comma-separated with "." as the decimal
# mark, so plain read.csv() is used (its defaults already are header = TRUE,
# sep = ",", dec = "."). Strings are kept as character, not factor.
raw_data <- read.csv(
  file = 'C:\\Users\\Abdullah\\Desktop\\housePrices\\train.csv',
  header = TRUE,
  stringsAsFactors = FALSE
)
# Work on a copy so the data as originally read stays untouched in raw_data.
modified_data <- raw_data
# Show the column names, types and first few values.
str(raw_data)
## 'data.frame': 1460 obs. of 81 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : chr "RL" "RL" "RL" "RL" ...
## $ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ Street : chr "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr NA NA NA NA ...
## $ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : chr "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : int 196 0 162 0 350 0 186 240 0 0 ...
## $ ExterQual : chr "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr "TA" "TA" "TA" "TA" ...
## $ Foundation : chr "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtFinType2 : chr "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : int 0 0 0 0 0 0 0 32 0 0 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : chr "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr "Y" "Y" "Y" "Y" ...
## $ Electrical : chr "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : chr "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : chr "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : chr NA "TA" "TA" "Gd" ...
## $ GarageType : chr "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
## $ GarageFinish : chr "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : int 2 2 2 3 3 2 2 2 2 1 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : chr "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
## $ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
## $ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr NA NA NA NA ...
## $ Fence : chr NA NA NA NA ...
## $ MiscFeature : chr NA NA NA NA ...
## $ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : chr "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Preview the first rows of the data set (head() shows six by default).
head(raw_data)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
As we can see, we have 81 variables in our data set: 1 Id, 1 SalePrice, 43 categorical and 36 quantitative. Each column is stored as either integer or character.
# Minimum, quartiles, mean and maximum of the target variable.
summary(raw_data$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Storage type of the target column.
typeof(raw_data$SalePrice)
## [1] "integer"
# Histogram of SalePrice with a kernel density curve drawn on top.
# The density curve is on a different vertical scale than the bin counts, so
# it is multiplied by the counts-to-density ratio of the first bin
# (per the hist() documentation, density = counts / (n * bin width)).
myhist <- hist(raw_data$SalePrice)
multiplier <- with(myhist, counts / density)
mydensity <- density(raw_data$SalePrice)
mydensity$y <- multiplier[1] * mydensity$y
plot(
  myhist,
  main = "Histogram of Sales",
  xlab = "Sales Price"
)
lines(mydensity)
We can see that the average sale price of a house is about $181k, and that the distribution is skewed to the right. As the histogram shows, the right tail is longer and most of the data is concentrated on the left of the graph, which indicates positive skewness. The peak also looks very sharp, which suggests a high kurtosis value.
# Box plot of the target variable to visualise potential outliers.
boxplot(raw_data$SalePrice)
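We can see that there are many outliers in SalePrice. In fact, we can count them, knowing that outliers are classified as values above 3rd Quartile + 1.5 * IQR. From the summary above, that is 214000 + 1.5 * (214000 - 129975) = 340037.5, which we round to 340000.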
# Count the sales above the upper boxplot fence (Q3 + 1.5 * IQR).
# With Q3 = 214000 and Q1 = 129975 (see the summary output above) the fence
# is 214000 + 1.5 * 84025 = 340037.5; the analysis uses the rounded 340000.
outlier_cutoff <- 340000
# Vectorised comparison: works for any number of rows (no hard-coded 1461)
# and skips NA prices instead of aborting on them.
count <- sum(raw_data$SalePrice > outlier_cutoff, na.rm = TRUE)
cat("Number of Sales that lie as outliers are: ", count)
## Number of Sales that lie as outliers are: 61
Now that we’ve seen that there are 61 outliers in SalePrice, let's look into them.
# Keep only the rows whose SalePrice is above the 340000 cutoff, then show them.
# which() drops NA comparisons, matching what subset() does.
outlier_sales <- raw_data[which(raw_data$SalePrice > 340000), , drop = FALSE]
print(outlier_sales)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 12 12 60 RL 85 11924 Pave <NA> IR1
## 54 54 20 RL 68 50271 Pave <NA> IR1
## 59 59 60 RL 66 13682 Pave <NA> IR2
## 113 113 60 RL 77 9965 Pave <NA> Reg
## 152 152 20 RL 107 13891 Pave <NA> Reg
## 162 162 60 RL 110 13688 Pave <NA> IR1
## 179 179 20 RL 63 17423 Pave <NA> IR1
## 186 186 75 RM 90 22950 Pave <NA> IR2
## 225 225 20 RL 103 13472 Pave <NA> Reg
## 232 232 60 RL 174 15138 Pave <NA> IR1
## 279 279 20 RL 107 14450 Pave <NA> Reg
## 310 310 20 RL 90 12378 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 12 Lvl AllPub Inside Gtl NridgHt Norm
## 54 Low AllPub Inside Gtl Veenker Norm
## 59 HLS AllPub CulDSac Gtl StoneBr Norm
## 113 Lvl AllPub Inside Gtl CollgCr Norm
## 152 Lvl AllPub Inside Gtl NridgHt Norm
## 162 Lvl AllPub Inside Gtl NridgHt Norm
## 179 Lvl AllPub CulDSac Gtl StoneBr Norm
## 186 Lvl AllPub Inside Gtl OldTown Artery
## 225 Lvl AllPub Inside Gtl NridgHt Norm
## 232 Lvl AllPub Inside Gtl NoRidge Norm
## 279 Lvl AllPub Inside Gtl NridgHt Norm
## 310 Lvl AllPub Inside Gtl NridgHt Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 12 Norm 1Fam 2Story 9 5 2005
## 54 Norm 1Fam 1Story 9 5 1981
## 59 Norm 1Fam 2Story 10 5 2006
## 113 Norm 1Fam 2Story 7 5 2007
## 152 Norm 1Fam 1Story 8 5 2007
## 162 Norm 1Fam 2Story 9 5 2003
## 179 Norm 1Fam 1Story 9 5 2008
## 186 Norm 1Fam 2.5Fin 10 9 1892
## 225 Norm 1Fam 1Story 10 5 2003
## 232 Norm 1Fam 2Story 8 5 1995
## 279 Norm 1Fam 1Story 9 5 2006
## 310 Norm 1Fam 1Story 9 5 2003
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 12 2006 Hip CompShg WdShing Wd Shng Stone
## 54 1987 Gable WdShngl WdShing Wd Shng None
## 59 2006 Hip CompShg VinylSd VinylSd BrkFace
## 113 2007 Gable CompShg VinylSd VinylSd Stone
## 152 2008 Hip CompShg VinylSd VinylSd Stone
## 162 2004 Gable CompShg VinylSd VinylSd BrkFace
## 179 2009 Hip CompShg VinylSd VinylSd Stone
## 186 1993 Gable WdShngl Wd Sdng Wd Sdng None
## 225 2003 Hip CompShg VinylSd VinylSd BrkFace
## 232 1996 Gable CompShg VinylSd VinylSd BrkFace
## 279 2007 Gable CompShg CemntBd CmentBd BrkFace
## 310 2004 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 12 286 Ex TA PConc Ex TA
## 54 0 Gd TA CBlock Ex TA
## 59 1031 Ex TA PConc Ex TA
## 113 220 Gd TA PConc Ex TA
## 152 436 Gd TA PConc Ex TA
## 162 664 Gd TA PConc Ex TA
## 179 748 Ex TA PConc Ex TA
## 186 0 Gd Gd BrkTil TA TA
## 225 922 Ex TA PConc Ex TA
## 232 506 Gd TA PConc Gd TA
## 279 315 Ex TA PConc Ex TA
## 310 0 Gd TA PConc Ex TA
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 12 No GLQ 998 Unf 0
## 54 Gd GLQ 1810 Unf 0
## 59 Gd Unf 0 Unf 0
## 113 Av GLQ 984 Unf 0
## 152 Gd GLQ 1400 Unf 0
## 162 Av GLQ 1016 Unf 0
## 179 No GLQ 1904 Unf 0
## 186 Mn Unf 0 Unf 0
## 225 Gd GLQ 56 Unf 0
## 232 No GLQ 689 Unf 0
## 279 Gd Unf 0 Unf 0
## 310 Gd GLQ 1274 Unf 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 12 177 1175 GasA Ex Y SBrkr
## 54 32 1842 GasA Gd Y SBrkr
## 59 1410 1410 GasA Ex Y SBrkr
## 113 280 1264 GasA Ex Y SBrkr
## 152 310 1710 GasA Ex Y SBrkr
## 162 556 1572 GasA Ex Y SBrkr
## 179 312 2216 GasA Ex Y SBrkr
## 186 1107 1107 GasA Ex Y SBrkr
## 225 2336 2392 GasA Ex Y SBrkr
## 232 773 1462 GasA Ex Y SBrkr
## 279 2121 2121 GasA Ex Y SBrkr
## 310 622 1896 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 12 1182 1142 0 2324 1 0
## 54 1842 0 0 1842 2 0
## 59 1426 1519 0 2945 0 0
## 113 1282 1414 0 2696 1 0
## 152 1710 0 0 1710 1 0
## 162 1572 1096 0 2668 1 0
## 179 2234 0 0 2234 1 0
## 186 1518 1518 572 3608 0 0
## 225 2392 0 0 2392 0 0
## 232 1490 1304 0 2794 1 0
## 279 2121 0 0 2121 0 0
## 310 1944 0 0 1944 1 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 12 3 0 4 1 Ex 11
## 54 0 1 0 1 Gd 5
## 59 3 1 3 1 Gd 10
## 113 2 1 4 1 Ex 10
## 152 2 0 2 1 Gd 6
## 162 2 1 3 1 Ex 10
## 179 2 0 1 1 Ex 9
## 186 2 1 4 1 Ex 12
## 225 2 0 3 1 Ex 8
## 232 2 1 4 1 Ex 9
## 279 2 1 3 1 Ex 8
## 310 2 0 3 1 Ex 8
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 12 Typ 2 Gd BuiltIn 2005 Fin
## 54 Typ 1 Gd Attchd 1981 Fin
## 59 Typ 1 Gd BuiltIn 2006 Fin
## 113 Typ 1 Gd BuiltIn 2007 Fin
## 152 Typ 1 Gd Attchd 2007 RFn
## 162 Typ 2 Gd BuiltIn 2003 Fin
## 179 Typ 1 Gd Attchd 2009 Fin
## 186 Typ 2 TA Detchd 1993 Unf
## 225 Typ 1 Ex Attchd 2003 Fin
## 232 Typ 1 TA Attchd 1995 Fin
## 279 Typ 1 Ex Attchd 2007 Fin
## 310 Typ 3 Ex Attchd 2003 Fin
## GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 12 3 736 TA TA Y 147
## 54 3 894 TA TA Y 857
## 59 3 641 TA TA Y 192
## 113 3 792 TA TA Y 120
## 152 3 866 TA TA Y 0
## 162 3 726 TA TA Y 400
## 179 3 1166 TA TA Y 0
## 186 3 840 Ex TA Y 0
## 225 3 968 TA TA Y 248
## 232 3 810 TA TA Y 0
## 279 3 732 TA TA Y 124
## 310 3 708 TA TA Y 208
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 12 21 0 0 0 0 <NA>
## 54 72 0 0 0 0 <NA>
## 59 0 37 0 0 0 <NA>
## 113 184 0 0 168 0 <NA>
## 152 102 0 0 0 0 <NA>
## 162 0 0 0 0 0 <NA>
## 179 60 0 0 0 0 <NA>
## 186 260 0 0 410 0 <NA>
## 225 105 0 0 0 0 <NA>
## 232 146 202 0 0 0 <NA>
## 279 98 0 0 142 0 <NA>
## 310 175 0 0 0 0 <NA>
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 12 <NA> <NA> 0 7 2006 New Partial
## 54 <NA> <NA> 0 11 2006 WD Normal
## 59 <NA> <NA> 0 10 2006 New Partial
## 113 <NA> <NA> 0 10 2007 New Partial
## 152 <NA> <NA> 0 1 2008 New Partial
## 162 <NA> <NA> 0 3 2008 WD Normal
## 179 <NA> <NA> 0 7 2009 New Partial
## 186 GdPrv <NA> 0 6 2006 WD Normal
## 225 <NA> <NA> 0 6 2009 WD Normal
## 232 <NA> <NA> 0 7 2009 WD Normal
## 279 <NA> <NA> 0 5 2007 New Partial
## 310 <NA> <NA> 0 11 2006 WD Normal
## SalePrice
## 12 345000
## 54 385000
## 59 438780
## 113 383970
## 152 372402
## 162 412500
## 179 501837
## 186 475000
## 225 386250
## 232 403000
## 279 415298
## 310 360000
## [ reached getOption("max.print") -- omitted 49 rows ]
As we can see, many of the entries have missing values. Before going further, it is better to address the missing values in our other 80 variables.
# Counting number of nulls (NA values) in each column and reporting every
# column that has at least one.
cat("---NULL COUNT---\n")
## ---NULL COUNT---
# colSums() over the NA mask gives one count per column, so no column index
# bound has to be hard-coded; every column of raw_data is inspected.
na_counts <- colSums(is.na(raw_data))
for (col_name in names(na_counts)[na_counts > 0]) {
  cat("Number of nulls in ", col_name, ": ")
  cat(na_counts[[col_name]], "\n")
}
## Number of nulls in LotFrontage : 259
## Number of nulls in Alley : 1369
## Number of nulls in MasVnrType : 8
## Number of nulls in MasVnrArea : 8
## Number of nulls in BsmtQual : 37
## Number of nulls in BsmtCond : 37
## Number of nulls in BsmtExposure : 38
## Number of nulls in BsmtFinType1 : 37
## Number of nulls in BsmtFinType2 : 38
## Number of nulls in Electrical : 1
## Number of nulls in FireplaceQu : 690
## Number of nulls in GarageType : 81
## Number of nulls in GarageYrBlt : 81
## Number of nulls in GarageFinish : 81
## Number of nulls in GarageQual : 81
## Number of nulls in GarageCond : 81
## Number of nulls in PoolQC : 1453
## Number of nulls in Fence : 1179
## Number of nulls in MiscFeature : 1406
There are 19 variables with NA values. It is important to note that an NA might mean that the observation is missing, or the NA might carry a meaning of its own. We must consult the data dictionary.
LotFrontage has 259 NA values. This is an integer value representing the linear feet of street connected to the property. Looking into the dataset, we see that the other observations have a value anywhere from 21 to 313. We assume that these 259 entries represent houses that have 0 linear feet of street connected to the property. Of course these could be genuinely missing values, but it is possible that the property simply does not connect to a street; we take this assumption and change the NAs to 0. As such, we will not be removing these observations.
# Replace every missing LotFrontage with 0 (treated as "no street frontage").
# Logical indexing handles all rows at once, so the row count is not hard-coded.
modified_data$LotFrontage[is.na(modified_data$LotFrontage)] <- 0
Alley has 1369 NAs; this high number suggests that the NAs mean something rather than being missing values. The data dictionary shows that NA represents no alley access. Rather than NA, let's change it to ‘None’, which is a bit more representative.
# Replace every missing Alley with the explicit category "None"
# (NA means "no alley access" in this column).
# Logical indexing handles all rows at once, so the row count is not hard-coded.
modified_data$Alley[is.na(modified_data$Alley)] <- "None"
MasVnrType and MasVnrArea both have 8 missing values. This is very suspicious. Could it be that they are missing in the same observations, perhaps because of another house feature? Let us check. Maybe our data dictionary can help us with this.
# Inspect the observations whose masonry veneer fields are missing.
masvnr_type_na <- is.na(raw_data$MasVnrType)
masvnr_area_na <- is.na(raw_data$MasVnrArea)
# Make the "same observations" assumption explicit instead of relying on
# reading the printout: warn if the two NA sets differ.
if (!identical(masvnr_type_na, masvnr_area_na)) {
  warning(
    "MasVnrType and MasVnrArea are not missing in the same rows",
    call. = FALSE
  )
}
MasVnr <- raw_data[masvnr_type_na, , drop = FALSE]
# NOTE(review): this stores a whole data frame in a single column named Area,
# which is why the printout repeats every column with an "Area." prefix.
# Kept so the displayed output stays the same; consider removing it.
MasVnr$Area <- raw_data[masvnr_area_na, , drop = FALSE]
MasVnr
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 235 235 60 RL NA 7851 Pave <NA> Reg
## 530 530 20 RL NA 32668 Pave <NA> IR1
## 651 651 60 FV 65 8125 Pave <NA> Reg
## 937 937 20 RL 67 10083 Pave <NA> Reg
## 974 974 20 FV 95 11639 Pave <NA> Reg
## 978 978 120 FV 35 4274 Pave Pave IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 235 Lvl AllPub Inside Gtl Gilbert Norm
## 530 Lvl AllPub CulDSac Gtl Crawfor Norm
## 651 Lvl AllPub Inside Gtl Somerst Norm
## 937 Lvl AllPub Inside Gtl SawyerW Norm
## 974 Lvl AllPub Corner Gtl Somerst Norm
## 978 Lvl AllPub Inside Gtl Somerst Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 235 Norm 1Fam 2Story 6 5 2002
## 530 Norm 1Fam 1Story 6 3 1957
## 651 Norm 1Fam 2Story 7 6 2007
## 937 Norm 1Fam 1Story 7 5 2003
## 974 Norm 1Fam 1Story 7 5 2007
## 978 Norm TwnhsE 1Story 7 5 2006
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 235 2002 Gable CompShg VinylSd VinylSd <NA>
## 530 1975 Hip CompShg Wd Sdng Stone <NA>
## 651 2007 Gable CompShg CemntBd CmentBd <NA>
## 937 2003 Gable CompShg VinylSd VinylSd <NA>
## 974 2008 Gable CompShg CemntBd CmentBd <NA>
## 978 2007 Gable CompShg VinylSd VinylSd <NA>
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 235 NA Gd TA PConc Gd TA
## 530 NA Gd TA PConc TA TA
## 651 NA Gd TA PConc Gd TA
## 937 NA Gd TA PConc Gd TA
## 974 NA Gd TA PConc Gd TA
## 978 NA Gd TA PConc Gd TA
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 235 No GLQ 625 Unf 0
## 530 No Rec 1219 Unf 0
## 651 No Unf 0 Unf 0
## 937 No GLQ 833 Unf 0
## 974 No Unf 0 Unf 0
## 978 No GLQ 1106 Unf 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 235 235 860 GasA Ex Y SBrkr
## 530 816 2035 GasA TA Y SBrkr
## 651 813 813 GasA Ex Y SBrkr
## 937 343 1176 GasA Ex Y SBrkr
## 974 1428 1428 GasA Ex Y SBrkr
## 978 135 1241 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 235 860 1100 0 1960 1 0
## 530 2515 0 0 2515 1 0
## 651 822 843 0 1665 0 0
## 937 1200 0 0 1200 1 0
## 974 1428 0 0 1428 0 0
## 978 1241 0 0 1241 1 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 235 2 1 4 1 Gd 8
## 530 3 0 4 2 TA 9
## 651 2 1 3 1 Gd 7
## 937 2 0 2 1 Gd 5
## 974 2 0 3 1 Gd 6
## 978 1 1 1 1 Gd 4
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 235 Typ 2 TA BuiltIn 2002 Fin
## 530 Maj1 2 TA Attchd 1975 RFn
## 651 Typ 0 <NA> Attchd 2007 RFn
## 937 Typ 0 <NA> Attchd 2003 RFn
## 974 Typ 0 <NA> Attchd 2007 Fin
## 978 Typ 0 <NA> Attchd 2007 Fin
## GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 235 2 440 TA TA Y 288
## 530 2 484 TA TA Y 0
## 651 2 562 TA TA Y 0
## 937 2 555 TA TA Y 0
## 974 2 480 TA TA Y 0
## 978 2 569 TA TA Y 0
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 235 48 0 0 0 0 <NA>
## 530 0 200 0 0 0 <NA>
## 651 0 0 0 0 0 <NA>
## 937 41 0 0 0 0 <NA>
## 974 120 0 0 0 0 <NA>
## 978 116 0 0 0 0 <NA>
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 235 <NA> <NA> 0 5 2010 WD Normal
## 530 <NA> <NA> 0 3 2007 WD Alloca
## 651 <NA> <NA> 0 5 2008 WD Normal
## 937 <NA> <NA> 0 8 2009 WD Normal
## 974 <NA> <NA> 0 12 2008 New Partial
## 978 <NA> <NA> 0 11 2007 New Partial
## SalePrice Area.Id Area.MSSubClass Area.MSZoning Area.LotFrontage
## 235 216500 235 60 RL NA
## 530 200624 530 20 RL NA
## 651 205950 651 60 FV 65
## 937 184900 937 20 RL 67
## 974 182000 974 20 FV 95
## 978 199900 978 120 FV 35
## Area.LotArea Area.Street Area.Alley Area.LotShape Area.LandContour
## 235 7851 Pave <NA> Reg Lvl
## 530 32668 Pave <NA> IR1 Lvl
## 651 8125 Pave <NA> Reg Lvl
## 937 10083 Pave <NA> Reg Lvl
## 974 11639 Pave <NA> Reg Lvl
## 978 4274 Pave Pave IR1 Lvl
## Area.Utilities Area.LotConfig Area.LandSlope Area.Neighborhood
## 235 AllPub Inside Gtl Gilbert
## 530 AllPub CulDSac Gtl Crawfor
## 651 AllPub Inside Gtl Somerst
## 937 AllPub Inside Gtl SawyerW
## 974 AllPub Corner Gtl Somerst
## 978 AllPub Inside Gtl Somerst
## Area.Condition1 Area.Condition2 Area.BldgType Area.HouseStyle
## 235 Norm Norm 1Fam 2Story
## 530 Norm Norm 1Fam 1Story
## 651 Norm Norm 1Fam 2Story
## 937 Norm Norm 1Fam 1Story
## 974 Norm Norm 1Fam 1Story
## 978 Norm Norm TwnhsE 1Story
## Area.OverallQual Area.OverallCond Area.YearBuilt Area.YearRemodAdd
## 235 6 5 2002 2002
## 530 6 3 1957 1975
## 651 7 6 2007 2007
## 937 7 5 2003 2003
## 974 7 5 2007 2008
## 978 7 5 2006 2007
## Area.RoofStyle Area.RoofMatl Area.Exterior1st Area.Exterior2nd
## 235 Gable CompShg VinylSd VinylSd
## 530 Hip CompShg Wd Sdng Stone
## 651 Gable CompShg CemntBd CmentBd
## 937 Gable CompShg VinylSd VinylSd
## 974 Gable CompShg CemntBd CmentBd
## 978 Gable CompShg VinylSd VinylSd
## Area.MasVnrType Area.MasVnrArea Area.ExterQual Area.ExterCond
## 235 <NA> NA Gd TA
## 530 <NA> NA Gd TA
## 651 <NA> NA Gd TA
## 937 <NA> NA Gd TA
## 974 <NA> NA Gd TA
## 978 <NA> NA Gd TA
## Area.Foundation Area.BsmtQual Area.BsmtCond Area.BsmtExposure
## 235 PConc Gd TA No
## 530 PConc TA TA No
## 651 PConc Gd TA No
## 937 PConc Gd TA No
## 974 PConc Gd TA No
## 978 PConc Gd TA No
## Area.BsmtFinType1 Area.BsmtFinSF1 Area.BsmtFinType2 Area.BsmtFinSF2
## 235 GLQ 625 Unf 0
## 530 Rec 1219 Unf 0
## 651 Unf 0 Unf 0
## 937 GLQ 833 Unf 0
## 974 Unf 0 Unf 0
## 978 GLQ 1106 Unf 0
## Area.BsmtUnfSF Area.TotalBsmtSF Area.Heating Area.HeatingQC
## 235 235 860 GasA Ex
## 530 816 2035 GasA TA
## 651 813 813 GasA Ex
## 937 343 1176 GasA Ex
## 974 1428 1428 GasA Ex
## 978 135 1241 GasA Ex
## Area.CentralAir Area.Electrical Area.X1stFlrSF Area.X2ndFlrSF
## 235 Y SBrkr 860 1100
## 530 Y SBrkr 2515 0
## 651 Y SBrkr 822 843
## 937 Y SBrkr 1200 0
## 974 Y SBrkr 1428 0
## 978 Y SBrkr 1241 0
## Area.LowQualFinSF Area.GrLivArea Area.BsmtFullBath Area.BsmtHalfBath
## 235 0 1960 1 0
## 530 0 2515 1 0
## 651 0 1665 0 0
## 937 0 1200 1 0
## 974 0 1428 0 0
## 978 0 1241 1 0
## Area.FullBath Area.HalfBath Area.BedroomAbvGr Area.KitchenAbvGr
## 235 2 1 4 1
## 530 3 0 4 2
## 651 2 1 3 1
## 937 2 0 2 1
## 974 2 0 3 1
## 978 1 1 1 1
## Area.KitchenQual Area.TotRmsAbvGrd Area.Functional Area.Fireplaces
## 235 Gd 8 Typ 2
## 530 TA 9 Maj1 2
## 651 Gd 7 Typ 0
## 937 Gd 5 Typ 0
## 974 Gd 6 Typ 0
## 978 Gd 4 Typ 0
## Area.FireplaceQu Area.GarageType Area.GarageYrBlt Area.GarageFinish
## 235 TA BuiltIn 2002 Fin
## 530 TA Attchd 1975 RFn
## 651 <NA> Attchd 2007 RFn
## 937 <NA> Attchd 2003 RFn
## 974 <NA> Attchd 2007 Fin
## 978 <NA> Attchd 2007 Fin
## Area.GarageCars Area.GarageArea Area.GarageQual Area.GarageCond
## 235 2 440 TA TA
## 530 2 484 TA TA
## 651 2 562 TA TA
## 937 2 555 TA TA
## 974 2 480 TA TA
## 978 2 569 TA TA
## Area.PavedDrive Area.WoodDeckSF Area.OpenPorchSF Area.EnclosedPorch
## 235 Y 288 48 0
## 530 Y 0 0 200
## 651 Y 0 0 0
## 937 Y 0 41 0
## 974 Y 0 120 0
## 978 Y 0 116 0
## Area.X3SsnPorch Area.ScreenPorch Area.PoolArea Area.PoolQC Area.Fence
## 235 0 0 0 <NA> <NA>
## 530 0 0 0 <NA> <NA>
## 651 0 0 0 <NA> <NA>
## 937 0 0 0 <NA> <NA>
## 974 0 0 0 <NA> <NA>
## 978 0 0 0 <NA> <NA>
## Area.MiscFeature Area.MiscVal Area.MoSold Area.YrSold Area.SaleType
## 235 <NA> 0 5 2010 WD
## 530 <NA> 0 3 2007 WD
## 651 <NA> 0 5 2008 WD
## 937 <NA> 0 8 2009 WD
## 974 <NA> 0 12 2008 New
## 978 <NA> 0 11 2007 New
## Area.SaleCondition Area.SalePrice
## 235 Normal 216500
## 530 Alloca 200624
## 651 Normal 205950
## 937 Normal 184900
## 974 Partial 182000
## 978 Partial 199900
## [ reached getOption("max.print") -- omitted 2 rows ]
Both of these variables are NA in the same observations. What is odd is that MasVnrType has a None category, so the NAs do not mean that there was no masonry veneer; rather, it seems the masonry data was not collected for these 8 observations. This suggests that it is reasonable to remove these observations.
# Drop the observations with missing masonry veneer data.
# The mask is built from modified_data itself (not raw_data), so it always
# lines up with the rows being filtered and the statement can be re-run safely.
# Both columns are tested even though their NA rows are expected to overlap.
modified_data <- subset(
  modified_data,
  !is.na(MasVnrType) & !is.na(MasVnrArea)
)
Next, let's look at BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2. All of these have 37-38 missing values. Let's see whether they are overlapping observations again, and whether we can figure out a pattern.
# Collect the observations whose BsmtExposure is missing and display them.
Bsmt <- raw_data[which(is.na(raw_data$BsmtExposure)), , drop = FALSE]
print(Bsmt)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 18 18 90 RL 72 10791 Pave <NA> Reg
## 40 40 90 RL 65 6040 Pave <NA> Reg
## 91 91 20 RL 60 7200 Pave <NA> Reg
## 103 103 90 RL 64 7018 Pave <NA> Reg
## 157 157 20 RL 60 7200 Pave <NA> Reg
## 183 183 20 RL 60 9060 Pave <NA> Reg
## 260 260 20 RM 70 12702 Pave <NA> Reg
## 343 343 90 RL NA 8544 Pave <NA> Reg
## 363 363 85 RL 64 7301 Pave <NA> Reg
## 372 372 50 RL 80 17120 Pave <NA> Reg
## 393 393 20 RL NA 8339 Pave <NA> IR1
## 521 521 190 RL 60 10800 Pave Grvl Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 18 Lvl AllPub Inside Gtl Sawyer Norm
## 40 Lvl AllPub Inside Gtl Edwards Norm
## 91 Lvl AllPub Inside Gtl NAmes Norm
## 103 Bnk AllPub Inside Gtl SawyerW Norm
## 157 Lvl AllPub Inside Gtl NAmes Norm
## 183 Lvl AllPub Inside Gtl Edwards Artery
## 260 Lvl AllPub Inside Gtl OldTown Norm
## 343 Lvl AllPub Inside Gtl NAmes Norm
## 363 Lvl AllPub Corner Gtl Edwards Norm
## 372 Lvl AllPub Inside Gtl ClearCr Feedr
## 393 Lvl AllPub Inside Gtl NAmes Norm
## 521 Lvl AllPub Inside Gtl OldTown Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 18 Norm Duplex 1Story 4 5 1967
## 40 Norm Duplex 1Story 4 5 1955
## 91 Norm 1Fam 1Story 4 5 1950
## 103 Norm Duplex 1Story 5 5 1979
## 157 Norm 1Fam 1Story 5 7 1950
## 183 Norm 1Fam 1Story 5 6 1957
## 260 Norm 1Fam 1Story 5 5 1956
## 343 Norm Duplex 1Story 3 4 1949
## 363 Norm 1Fam SFoyer 7 5 2003
## 372 Norm 1Fam 1.5Fin 4 4 1959
## 393 Norm 1Fam 1Story 5 7 1959
## 521 Norm 2fmCon 2Story 4 7 1900
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 18 1967 Gable CompShg MetalSd MetalSd None
## 40 1955 Gable CompShg AsbShng Plywood None
## 91 1950 Gable CompShg BrkFace Wd Sdng None
## 103 1979 Gable CompShg HdBoard HdBoard None
## 157 1950 Hip CompShg Wd Sdng Wd Sdng None
## 183 2006 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 260 1956 Gable CompShg BrkFace BrkFace None
## 343 1950 Gable CompShg Stucco Stucco BrkFace
## 363 2003 Gable CompShg HdBoard HdBoard BrkFace
## 372 1959 Gable CompShg WdShing Plywood None
## 393 1959 Gable CompShg MetalSd MetalSd None
## 521 2000 Gable CompShg MetalSd MetalSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 18 0 TA TA Slab <NA> <NA>
## 40 0 TA TA PConc <NA> <NA>
## 91 0 TA TA Slab <NA> <NA>
## 103 0 TA Fa Slab <NA> <NA>
## 157 0 TA TA CBlock <NA> <NA>
## 183 98 TA TA PConc <NA> <NA>
## 260 0 TA TA PConc <NA> <NA>
## 343 340 TA TA Slab <NA> <NA>
## 363 500 Gd TA Slab <NA> <NA>
## 372 0 TA TA CBlock <NA> <NA>
## 393 0 TA TA Slab <NA> <NA>
## 521 0 TA TA BrkTil <NA> <NA>
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 18 <NA> <NA> 0 <NA> 0
## 40 <NA> <NA> 0 <NA> 0
## 91 <NA> <NA> 0 <NA> 0
## 103 <NA> <NA> 0 <NA> 0
## 157 <NA> <NA> 0 <NA> 0
## 183 <NA> <NA> 0 <NA> 0
## 260 <NA> <NA> 0 <NA> 0
## 343 <NA> <NA> 0 <NA> 0
## 363 <NA> <NA> 0 <NA> 0
## 372 <NA> <NA> 0 <NA> 0
## 393 <NA> <NA> 0 <NA> 0
## 521 <NA> <NA> 0 <NA> 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 18 0 0 GasA TA Y SBrkr
## 40 0 0 GasA TA N FuseP
## 91 0 0 GasA TA Y FuseA
## 103 0 0 GasA TA Y SBrkr
## 157 0 0 GasA TA Y FuseF
## 183 0 0 GasA Ex Y SBrkr
## 260 0 0 GasA Gd Y FuseA
## 343 0 0 Wall Fa N FuseA
## 363 0 0 GasA Ex Y SBrkr
## 372 0 0 GasA TA Y SBrkr
## 393 0 0 GasA TA Y SBrkr
## 521 0 0 GasA TA N FuseA
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 18 1296 0 0 1296 0 0
## 40 1152 0 0 1152 0 0
## 91 1040 0 0 1040 0 0
## 103 1535 0 0 1535 0 0
## 157 1040 0 0 1040 0 0
## 183 1340 0 0 1340 0 0
## 260 882 0 0 882 0 0
## 343 1040 0 0 1040 0 0
## 363 495 1427 0 1922 0 0
## 372 1120 468 0 1588 0 0
## 393 882 0 0 882 0 0
## 521 694 600 0 1294 0 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 18 2 0 2 2 TA 6
## 40 2 0 2 2 Fa 6
## 91 1 0 2 1 TA 4
## 103 2 0 4 2 TA 8
## 157 1 0 2 1 TA 5
## 183 1 0 3 1 TA 7
## 260 1 0 2 1 TA 4
## 343 2 0 2 2 TA 6
## 363 3 0 4 1 Gd 7
## 372 2 0 4 1 TA 7
## 393 1 0 3 1 TA 5
## 521 2 0 3 2 TA 7
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 18 Typ 0 <NA> CarPort 1967 Unf
## 40 Typ 0 <NA> <NA> NA <NA>
## 91 Typ 0 <NA> Detchd 1950 Unf
## 103 Typ 0 <NA> Attchd 1979 Unf
## 157 Typ 0 <NA> Detchd 1950 Unf
## 183 Typ 1 Gd Attchd 1957 RFn
## 260 Typ 0 <NA> Detchd 1956 Unf
## 343 Typ 0 <NA> Detchd 1949 Unf
## 363 Typ 1 Ex BuiltIn 2003 RFn
## 372 Min2 1 Gd Detchd 1991 Fin
## 393 Typ 0 <NA> Attchd 1959 RFn
## 521 Typ 0 <NA> <NA> NA <NA>
## GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 18 2 516 TA TA Y 0
## 40 0 0 <NA> <NA> N 0
## 91 2 420 TA TA Y 0
## 103 2 410 TA TA Y 0
## 157 2 625 TA TA Y 0
## 183 1 252 TA TA Y 116
## 260 1 308 TA TA Y 0
## 343 2 400 TA TA Y 0
## 363 2 672 TA TA Y 0
## 372 2 680 TA TA N 0
## 393 1 294 TA TA Y 0
## 521 0 0 <NA> <NA> N 220
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 18 0 0 0 0 0 <NA>
## 40 0 0 0 0 0 <NA>
## 91 29 0 0 0 0 <NA>
## 103 0 0 0 0 0 <NA>
## 157 0 0 0 0 0 <NA>
## 183 0 0 180 0 0 <NA>
## 260 45 0 0 0 0 <NA>
## 343 0 0 0 0 0 <NA>
## 363 0 177 0 0 0 <NA>
## 372 59 0 0 0 0 <NA>
## 393 0 0 0 0 0 <NA>
## 521 114 210 0 0 0 <NA>
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 18 <NA> Shed 500 10 2006 WD Normal
## 40 <NA> <NA> 0 6 2008 WD AdjLand
## 91 <NA> <NA> 0 7 2006 WD Normal
## 103 <NA> <NA> 0 6 2009 WD Alloca
## 157 <NA> <NA> 0 6 2006 WD Normal
## 183 MnPrv <NA> 0 6 2007 WD Normal
## 260 <NA> <NA> 0 12 2008 WD Normal
## 343 <NA> <NA> 0 5 2006 WD Normal
## 363 <NA> <NA> 0 7 2009 ConLD Normal
## 372 <NA> <NA> 0 7 2008 WD Normal
## 393 MnPrv Shed 1200 7 2007 WD Normal
## 521 <NA> <NA> 0 8 2008 WD Normal
## SalePrice
## 18 90000
## 40 82000
## 91 109900
## 103 118964
## 157 109500
## 183 120000
## 260 97000
## 343 87500
## 363 198500
## 372 134432
## 393 106500
## 521 106250
## [ reached getOption("max.print") -- omitted 26 rows ]
Here we can see that these rows all overlap, and there is a pattern: every affected variable describes the basement, so perhaps there is something odd about the basement. Looking in the data dictionary confirms our suspicion: NA represents "No Basement" for all of these variables. These are not missing values; NA is a very meaningful entry. I don’t like the use of NA here, so I would rather use a more descriptive categorical name: NoB.
# Recode NA in the basement descriptor columns as the explicit label "NoB".
# The replacement is vectorized over every row of modified_data, so the row
# count no longer has to be hard-coded into a loop bound.
bsmt_cols <- c(
  "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"
)
for (bsmt_col in bsmt_cols) {
  is_missing <- is.na(modified_data[[bsmt_col]])
  modified_data[[bsmt_col]][is_missing] <- "NoB"
}
# Show the rows whose basement exposure now carries the "NoB" label.
Bsmt1 <- subset(modified_data, modified_data$BsmtExposure == "NoB")
Bsmt1
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 18 18 90 RL 72 10791 Pave None Reg
## 40 40 90 RL 65 6040 Pave None Reg
## 91 91 20 RL 60 7200 Pave None Reg
## 103 103 90 RL 64 7018 Pave None Reg
## 157 157 20 RL 60 7200 Pave None Reg
## 183 183 20 RL 60 9060 Pave None Reg
## 260 260 20 RM 70 12702 Pave None Reg
## 343 343 90 RL 0 8544 Pave None Reg
## 363 363 85 RL 64 7301 Pave None Reg
## 372 372 50 RL 80 17120 Pave None Reg
## 393 393 20 RL 0 8339 Pave None IR1
## 521 521 190 RL 60 10800 Pave Grvl Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 18 Lvl AllPub Inside Gtl Sawyer Norm
## 40 Lvl AllPub Inside Gtl Edwards Norm
## 91 Lvl AllPub Inside Gtl NAmes Norm
## 103 Bnk AllPub Inside Gtl SawyerW Norm
## 157 Lvl AllPub Inside Gtl NAmes Norm
## 183 Lvl AllPub Inside Gtl Edwards Artery
## 260 Lvl AllPub Inside Gtl OldTown Norm
## 343 Lvl AllPub Inside Gtl NAmes Norm
## 363 Lvl AllPub Corner Gtl Edwards Norm
## 372 Lvl AllPub Inside Gtl ClearCr Feedr
## 393 Lvl AllPub Inside Gtl NAmes Norm
## 521 Lvl AllPub Inside Gtl OldTown Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 18 Norm Duplex 1Story 4 5 1967
## 40 Norm Duplex 1Story 4 5 1955
## 91 Norm 1Fam 1Story 4 5 1950
## 103 Norm Duplex 1Story 5 5 1979
## 157 Norm 1Fam 1Story 5 7 1950
## 183 Norm 1Fam 1Story 5 6 1957
## 260 Norm 1Fam 1Story 5 5 1956
## 343 Norm Duplex 1Story 3 4 1949
## 363 Norm 1Fam SFoyer 7 5 2003
## 372 Norm 1Fam 1.5Fin 4 4 1959
## 393 Norm 1Fam 1Story 5 7 1959
## 521 Norm 2fmCon 2Story 4 7 1900
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 18 1967 Gable CompShg MetalSd MetalSd None
## 40 1955 Gable CompShg AsbShng Plywood None
## 91 1950 Gable CompShg BrkFace Wd Sdng None
## 103 1979 Gable CompShg HdBoard HdBoard None
## 157 1950 Hip CompShg Wd Sdng Wd Sdng None
## 183 2006 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 260 1956 Gable CompShg BrkFace BrkFace None
## 343 1950 Gable CompShg Stucco Stucco BrkFace
## 363 2003 Gable CompShg HdBoard HdBoard BrkFace
## 372 1959 Gable CompShg WdShing Plywood None
## 393 1959 Gable CompShg MetalSd MetalSd None
## 521 2000 Gable CompShg MetalSd MetalSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 18 0 TA TA Slab NoB NoB
## 40 0 TA TA PConc NoB NoB
## 91 0 TA TA Slab NoB NoB
## 103 0 TA Fa Slab NoB NoB
## 157 0 TA TA CBlock NoB NoB
## 183 98 TA TA PConc NoB NoB
## 260 0 TA TA PConc NoB NoB
## 343 340 TA TA Slab NoB NoB
## 363 500 Gd TA Slab NoB NoB
## 372 0 TA TA CBlock NoB NoB
## 393 0 TA TA Slab NoB NoB
## 521 0 TA TA BrkTil NoB NoB
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 18 NoB NoB 0 NoB 0
## 40 NoB NoB 0 NoB 0
## 91 NoB NoB 0 NoB 0
## 103 NoB NoB 0 NoB 0
## 157 NoB NoB 0 NoB 0
## 183 NoB NoB 0 NoB 0
## 260 NoB NoB 0 NoB 0
## 343 NoB NoB 0 NoB 0
## 363 NoB NoB 0 NoB 0
## 372 NoB NoB 0 NoB 0
## 393 NoB NoB 0 NoB 0
## 521 NoB NoB 0 NoB 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 18 0 0 GasA TA Y SBrkr
## 40 0 0 GasA TA N FuseP
## 91 0 0 GasA TA Y FuseA
## 103 0 0 GasA TA Y SBrkr
## 157 0 0 GasA TA Y FuseF
## 183 0 0 GasA Ex Y SBrkr
## 260 0 0 GasA Gd Y FuseA
## 343 0 0 Wall Fa N FuseA
## 363 0 0 GasA Ex Y SBrkr
## 372 0 0 GasA TA Y SBrkr
## 393 0 0 GasA TA Y SBrkr
## 521 0 0 GasA TA N FuseA
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 18 1296 0 0 1296 0 0
## 40 1152 0 0 1152 0 0
## 91 1040 0 0 1040 0 0
## 103 1535 0 0 1535 0 0
## 157 1040 0 0 1040 0 0
## 183 1340 0 0 1340 0 0
## 260 882 0 0 882 0 0
## 343 1040 0 0 1040 0 0
## 363 495 1427 0 1922 0 0
## 372 1120 468 0 1588 0 0
## 393 882 0 0 882 0 0
## 521 694 600 0 1294 0 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 18 2 0 2 2 TA 6
## 40 2 0 2 2 Fa 6
## 91 1 0 2 1 TA 4
## 103 2 0 4 2 TA 8
## 157 1 0 2 1 TA 5
## 183 1 0 3 1 TA 7
## 260 1 0 2 1 TA 4
## 343 2 0 2 2 TA 6
## 363 3 0 4 1 Gd 7
## 372 2 0 4 1 TA 7
## 393 1 0 3 1 TA 5
## 521 2 0 3 2 TA 7
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 18 Typ 0 <NA> CarPort 1967 Unf
## 40 Typ 0 <NA> <NA> NA <NA>
## 91 Typ 0 <NA> Detchd 1950 Unf
## 103 Typ 0 <NA> Attchd 1979 Unf
## 157 Typ 0 <NA> Detchd 1950 Unf
## 183 Typ 1 Gd Attchd 1957 RFn
## 260 Typ 0 <NA> Detchd 1956 Unf
## 343 Typ 0 <NA> Detchd 1949 Unf
## 363 Typ 1 Ex BuiltIn 2003 RFn
## 372 Min2 1 Gd Detchd 1991 Fin
## 393 Typ 0 <NA> Attchd 1959 RFn
## 521 Typ 0 <NA> <NA> NA <NA>
## GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 18 2 516 TA TA Y 0
## 40 0 0 <NA> <NA> N 0
## 91 2 420 TA TA Y 0
## 103 2 410 TA TA Y 0
## 157 2 625 TA TA Y 0
## 183 1 252 TA TA Y 116
## 260 1 308 TA TA Y 0
## 343 2 400 TA TA Y 0
## 363 2 672 TA TA Y 0
## 372 2 680 TA TA N 0
## 393 1 294 TA TA Y 0
## 521 0 0 <NA> <NA> N 220
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 18 0 0 0 0 0 <NA>
## 40 0 0 0 0 0 <NA>
## 91 29 0 0 0 0 <NA>
## 103 0 0 0 0 0 <NA>
## 157 0 0 0 0 0 <NA>
## 183 0 0 180 0 0 <NA>
## 260 45 0 0 0 0 <NA>
## 343 0 0 0 0 0 <NA>
## 363 0 177 0 0 0 <NA>
## 372 59 0 0 0 0 <NA>
## 393 0 0 0 0 0 <NA>
## 521 114 210 0 0 0 <NA>
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 18 <NA> Shed 500 10 2006 WD Normal
## 40 <NA> <NA> 0 6 2008 WD AdjLand
## 91 <NA> <NA> 0 7 2006 WD Normal
## 103 <NA> <NA> 0 6 2009 WD Alloca
## 157 <NA> <NA> 0 6 2006 WD Normal
## 183 MnPrv <NA> 0 6 2007 WD Normal
## 260 <NA> <NA> 0 12 2008 WD Normal
## 343 <NA> <NA> 0 5 2006 WD Normal
## 363 <NA> <NA> 0 7 2009 ConLD Normal
## 372 <NA> <NA> 0 7 2008 WD Normal
## 393 MnPrv Shed 1200 7 2007 WD Normal
## 521 <NA> <NA> 0 8 2008 WD Normal
## SalePrice
## 18 90000
## 40 82000
## 91 109900
## 103 118964
## 157 109500
## 183 120000
## 260 97000
## 343 87500
## 363 198500
## 372 134432
## 393 106500
## 521 106250
## [ reached getOption("max.print") -- omitted 26 rows ]
# Rows whose second basement finish type carries the "NoB" label.
# which() drops NA comparisons, matching what subset() does.
Bsmt2 <- modified_data[which(modified_data$BsmtFinType2 == "NoB"), ]
Bsmt2
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 18 18 90 RL 72 10791 Pave None Reg
## 40 40 90 RL 65 6040 Pave None Reg
## 91 91 20 RL 60 7200 Pave None Reg
## 103 103 90 RL 64 7018 Pave None Reg
## 157 157 20 RL 60 7200 Pave None Reg
## 183 183 20 RL 60 9060 Pave None Reg
## 260 260 20 RM 70 12702 Pave None Reg
## 333 333 20 RL 85 10655 Pave None IR1
## 343 343 90 RL 0 8544 Pave None Reg
## 363 363 85 RL 64 7301 Pave None Reg
## 372 372 50 RL 80 17120 Pave None Reg
## 393 393 20 RL 0 8339 Pave None IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 18 Lvl AllPub Inside Gtl Sawyer Norm
## 40 Lvl AllPub Inside Gtl Edwards Norm
## 91 Lvl AllPub Inside Gtl NAmes Norm
## 103 Bnk AllPub Inside Gtl SawyerW Norm
## 157 Lvl AllPub Inside Gtl NAmes Norm
## 183 Lvl AllPub Inside Gtl Edwards Artery
## 260 Lvl AllPub Inside Gtl OldTown Norm
## 333 Lvl AllPub Inside Gtl NridgHt Norm
## 343 Lvl AllPub Inside Gtl NAmes Norm
## 363 Lvl AllPub Corner Gtl Edwards Norm
## 372 Lvl AllPub Inside Gtl ClearCr Feedr
## 393 Lvl AllPub Inside Gtl NAmes Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 18 Norm Duplex 1Story 4 5 1967
## 40 Norm Duplex 1Story 4 5 1955
## 91 Norm 1Fam 1Story 4 5 1950
## 103 Norm Duplex 1Story 5 5 1979
## 157 Norm 1Fam 1Story 5 7 1950
## 183 Norm 1Fam 1Story 5 6 1957
## 260 Norm 1Fam 1Story 5 5 1956
## 333 Norm 1Fam 1Story 8 5 2003
## 343 Norm Duplex 1Story 3 4 1949
## 363 Norm 1Fam SFoyer 7 5 2003
## 372 Norm 1Fam 1.5Fin 4 4 1959
## 393 Norm 1Fam 1Story 5 7 1959
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 18 1967 Gable CompShg MetalSd MetalSd None
## 40 1955 Gable CompShg AsbShng Plywood None
## 91 1950 Gable CompShg BrkFace Wd Sdng None
## 103 1979 Gable CompShg HdBoard HdBoard None
## 157 1950 Hip CompShg Wd Sdng Wd Sdng None
## 183 2006 Hip CompShg Wd Sdng Wd Sdng BrkFace
## 260 1956 Gable CompShg BrkFace BrkFace None
## 333 2004 Gable CompShg VinylSd VinylSd BrkFace
## 343 1950 Gable CompShg Stucco Stucco BrkFace
## 363 2003 Gable CompShg HdBoard HdBoard BrkFace
## 372 1959 Gable CompShg WdShing Plywood None
## 393 1959 Gable CompShg MetalSd MetalSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 18 0 TA TA Slab NoB NoB
## 40 0 TA TA PConc NoB NoB
## 91 0 TA TA Slab NoB NoB
## 103 0 TA Fa Slab NoB NoB
## 157 0 TA TA CBlock NoB NoB
## 183 98 TA TA PConc NoB NoB
## 260 0 TA TA PConc NoB NoB
## 333 296 Gd TA PConc Gd TA
## 343 340 TA TA Slab NoB NoB
## 363 500 Gd TA Slab NoB NoB
## 372 0 TA TA CBlock NoB NoB
## 393 0 TA TA Slab NoB NoB
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 18 NoB NoB 0 NoB 0
## 40 NoB NoB 0 NoB 0
## 91 NoB NoB 0 NoB 0
## 103 NoB NoB 0 NoB 0
## 157 NoB NoB 0 NoB 0
## 183 NoB NoB 0 NoB 0
## 260 NoB NoB 0 NoB 0
## 333 No GLQ 1124 NoB 479
## 343 NoB NoB 0 NoB 0
## 363 NoB NoB 0 NoB 0
## 372 NoB NoB 0 NoB 0
## 393 NoB NoB 0 NoB 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 18 0 0 GasA TA Y SBrkr
## 40 0 0 GasA TA N FuseP
## 91 0 0 GasA TA Y FuseA
## 103 0 0 GasA TA Y SBrkr
## 157 0 0 GasA TA Y FuseF
## 183 0 0 GasA Ex Y SBrkr
## 260 0 0 GasA Gd Y FuseA
## 333 1603 3206 GasA Ex Y SBrkr
## 343 0 0 Wall Fa N FuseA
## 363 0 0 GasA Ex Y SBrkr
## 372 0 0 GasA TA Y SBrkr
## 393 0 0 GasA TA Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 18 1296 0 0 1296 0 0
## 40 1152 0 0 1152 0 0
## 91 1040 0 0 1040 0 0
## 103 1535 0 0 1535 0 0
## 157 1040 0 0 1040 0 0
## 183 1340 0 0 1340 0 0
## 260 882 0 0 882 0 0
## 333 1629 0 0 1629 1 0
## 343 1040 0 0 1040 0 0
## 363 495 1427 0 1922 0 0
## 372 1120 468 0 1588 0 0
## 393 882 0 0 882 0 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 18 2 0 2 2 TA 6
## 40 2 0 2 2 Fa 6
## 91 1 0 2 1 TA 4
## 103 2 0 4 2 TA 8
## 157 1 0 2 1 TA 5
## 183 1 0 3 1 TA 7
## 260 1 0 2 1 TA 4
## 333 2 0 3 1 Gd 7
## 343 2 0 2 2 TA 6
## 363 3 0 4 1 Gd 7
## 372 2 0 4 1 TA 7
## 393 1 0 3 1 TA 5
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 18 Typ 0 <NA> CarPort 1967 Unf
## 40 Typ 0 <NA> <NA> NA <NA>
## 91 Typ 0 <NA> Detchd 1950 Unf
## 103 Typ 0 <NA> Attchd 1979 Unf
## 157 Typ 0 <NA> Detchd 1950 Unf
## 183 Typ 1 Gd Attchd 1957 RFn
## 260 Typ 0 <NA> Detchd 1956 Unf
## 333 Typ 1 Gd Attchd 2003 RFn
## 343 Typ 0 <NA> Detchd 1949 Unf
## 363 Typ 1 Ex BuiltIn 2003 RFn
## 372 Min2 1 Gd Detchd 1991 Fin
## 393 Typ 0 <NA> Attchd 1959 RFn
## GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 18 2 516 TA TA Y 0
## 40 0 0 <NA> <NA> N 0
## 91 2 420 TA TA Y 0
## 103 2 410 TA TA Y 0
## 157 2 625 TA TA Y 0
## 183 1 252 TA TA Y 116
## 260 1 308 TA TA Y 0
## 333 3 880 TA TA Y 0
## 343 2 400 TA TA Y 0
## 363 2 672 TA TA Y 0
## 372 2 680 TA TA N 0
## 393 1 294 TA TA Y 0
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 18 0 0 0 0 0 <NA>
## 40 0 0 0 0 0 <NA>
## 91 29 0 0 0 0 <NA>
## 103 0 0 0 0 0 <NA>
## 157 0 0 0 0 0 <NA>
## 183 0 0 180 0 0 <NA>
## 260 45 0 0 0 0 <NA>
## 333 0 0 0 0 0 <NA>
## 343 0 0 0 0 0 <NA>
## 363 0 177 0 0 0 <NA>
## 372 59 0 0 0 0 <NA>
## 393 0 0 0 0 0 <NA>
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 18 <NA> Shed 500 10 2006 WD Normal
## 40 <NA> <NA> 0 6 2008 WD AdjLand
## 91 <NA> <NA> 0 7 2006 WD Normal
## 103 <NA> <NA> 0 6 2009 WD Alloca
## 157 <NA> <NA> 0 6 2006 WD Normal
## 183 MnPrv <NA> 0 6 2007 WD Normal
## 260 <NA> <NA> 0 12 2008 WD Normal
## 333 <NA> <NA> 0 10 2009 WD Normal
## 343 <NA> <NA> 0 5 2006 WD Normal
## 363 <NA> <NA> 0 7 2009 ConLD Normal
## 372 <NA> <NA> 0 7 2008 WD Normal
## 393 MnPrv Shed 1200 7 2007 WD Normal
## SalePrice
## 18 90000
## 40 82000
## 91 109900
## 103 118964
## 157 109500
## 183 120000
## 260 97000
## 333 284000
## 343 87500
## 363 198500
## 372 134432
## 393 106500
## [ reached getOption("max.print") -- omitted 26 rows ]
It’s important to note that observation 949 has a basement that is unfinished, yet its Exposure is set to NA. This is a potential missing value, but only because we know that the other categorical variables label this observation as UNFINISHED rather than NO BASEMENT. The exposure could have been set to No exposure, but instead it was set to the level representing no basement. I think it’s safe to remove this observation.
Also, observation 333 has a singular basement with FinType2 as NA. FinType1 does have a value, however, and after looking through other observations, this strikes me as very odd. FinType represents the finishing of the basement: FinType1 represents the first layer, and FinType2 represents any additional layers (if there are any). However, in the event that there is only one layer, other entries have FinType2 as UNF or No Basement. I think it is safe to eliminate this observation.
# Drop observations 949 and 333 (the two inconsistent basement records).
# Rows are matched on the Id column rather than by position: a negative
# positional index removes whatever row currently sits at that position, which
# is a different house as soon as any earlier row has been filtered out.
modified_data <- modified_data[!(modified_data$Id %in% c(333, 949)), ]
Poof! The two records are gone.
For the missing Electrical value, we will remove the observation, because NA has no defined meaning for this variable and every house needs to have one of the listed options.
# Keep only the rows that have an Electrical value recorded.
modified_data <- modified_data[!is.na(modified_data$Electrical), ]
Poof! It’s gone!
For FireplaceQu, I looked ahead at the data dictionary, and it clearly states that every NA means no fireplace, so we can recode these entries with a better categorical value: NoF.
# Recode NA in FireplaceQu as the explicit label "NoF".
# Vectorized over every row, so no hard-coded row count is needed.
modified_data$FireplaceQu[is.na(modified_data$FireplaceQu)] <- "NoF"
And now again we have 5 variables that describe the same part of the house, the garage (GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond), and they have equal numbers of NAs (81). To no surprise, NA for each of those variables means no garage. We’ll change this to NoG instead.
# Recode NA in the garage descriptor columns as the explicit label "NoG".
# Vectorized over every row, so no hard-coded row count is needed.
# Writing the string "NoG" into GarageYrBlt coerces that column to character,
# exactly as the element-by-element assignment did.
garage_cols <- c(
  "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"
)
for (garage_col in garage_cols) {
  is_missing <- is.na(modified_data[[garage_col]])
  modified_data[[garage_col]][is_missing] <- "NoG"
}
I found that the next three attributes had a lot of NA entries. So I looked into the data dictionary: in each case NA represents the absence of the item the attribute describes. They are not missing values, so they will not be excluded, but given better names. NA for PoolQC will be changed to NoP, NA for Fence will be NoF, and NA for MiscFeature will become NoM.
# Recode NA in PoolQC, Fence and MiscFeature with a descriptive label per
# column. Vectorized over every row, so no hard-coded row count is needed.
absent_labels <- c(PoolQC = "NoP", Fence = "NoF", MiscFeature = "NoM")
for (feature_col in names(absent_labels)) {
  is_missing <- is.na(modified_data[[feature_col]])
  modified_data[[feature_col]][is_missing] <- absent_labels[[feature_col]]
}
We should be done with addressing missing values; let’s check!
# Report every column of modified_data that still contains NA values, along
# with how many it has. All columns are inspected (including the last one),
# instead of stopping at a hard-coded column index.
cat("---NULL COUNT---\n")
## ---NULL COUNT---
na_counts <- colSums(is.na(modified_data))
for (na_col in names(na_counts)[na_counts > 0]) {
  cat("Number of nulls in ", na_col, ": ")
  cat(na_counts[[na_col]], "\n")
}
# Houses that sold for more than 340000, listed by ascending sale price.
# which() drops NA comparisons, matching what subset() does.
outlier_sales <- modified_data[which(modified_data$SalePrice > 340000), ]
outlier_sales[order(outlier_sales$SalePrice), ]
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 719 719 60 RL 96 10542 Pave None Reg
## 321 321 60 RL 111 16259 Pave None Reg
## 12 12 60 RL 85 11924 Pave None IR1
## 643 643 80 RL 75 13860 Pave None Reg
## 991 991 60 RL 82 9452 Pave None Reg
## 655 655 20 RL 91 10437 Pave None IR1
## 878 878 60 RL 74 8834 Pave None Reg
## 322 322 60 RL 99 12099 Pave None IR1
## 609 609 70 RL 78 12168 Pave None Reg
## 310 310 20 RL 90 12378 Pave None IR1
## 703 703 60 RL 82 12438 Pave None IR1
## 1229 1229 120 RL 65 8769 Pave None Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 719 Lvl AllPub Inside Gtl NoRidge Norm
## 321 Lvl AllPub Corner Gtl NridgHt Norm
## 12 Lvl AllPub Inside Gtl NridgHt Norm
## 643 Lvl AllPub Inside Gtl NAmes Norm
## 991 Lvl AllPub Inside Gtl NoRidge Norm
## 655 Lvl AllPub Inside Gtl NoRidge Norm
## 878 Lvl AllPub Inside Gtl NridgHt Norm
## 322 Lvl AllPub Inside Gtl NridgHt Norm
## 609 HLS AllPub Inside Mod Crawfor Norm
## 310 Lvl AllPub Inside Gtl NridgHt Norm
## 703 Lvl AllPub Inside Gtl StoneBr Norm
## 1229 Lvl AllPub Corner Gtl NridgHt Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 719 Norm 1Fam 2Story 7 5 1993
## 321 Norm 1Fam 2Story 9 5 2006
## 12 Norm 1Fam 2Story 9 5 2005
## 643 Norm 1Fam SLvl 8 7 1972
## 991 Norm 1Fam 2Story 8 5 1997
## 655 Norm 1Fam 1Story 8 6 1995
## 878 Norm 1Fam 2Story 9 5 2004
## 322 Norm 1Fam 2Story 8 5 2004
## 609 Norm 1Fam 2Story 8 6 1934
## 310 Norm 1Fam 1Story 9 5 2003
## 703 Norm 1Fam 2Story 8 5 2006
## 1229 Norm TwnhsE 1Story 9 5 2008
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 719 1994 Hip CompShg Wd Sdng ImStucc BrkFace
## 321 2006 Gable CompShg VinylSd VinylSd Stone
## 12 2006 Hip CompShg WdShing Wd Shng Stone
## 643 1995 Gable CompShg Plywood Wd Sdng None
## 991 1998 Gable CompShg VinylSd VinylSd BrkFace
## 655 1995 Hip CompShg MetalSd MetalSd BrkFace
## 878 2005 Hip CompShg VinylSd VinylSd Stone
## 322 2004 Gable CompShg VinylSd VinylSd BrkFace
## 609 1998 Gable CompShg BrkFace Wd Sdng None
## 310 2004 Gable CompShg VinylSd VinylSd None
## 703 2006 Hip CompShg VinylSd VinylSd BrkFace
## 1229 2008 Hip CompShg MetalSd MetalSd BrkFace
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 719 651 Gd TA PConc Gd TA
## 321 370 TA TA PConc Ex Gd
## 12 286 Ex TA PConc Ex TA
## 643 0 Gd TA CBlock Gd TA
## 991 423 Gd TA PConc Gd TA
## 655 660 Gd Gd PConc Gd TA
## 878 216 Gd TA PConc Ex TA
## 322 388 Gd TA PConc Ex TA
## 609 0 TA TA PConc Gd TA
## 310 0 Gd TA PConc Ex TA
## 703 466 Ex TA PConc Ex Gd
## 1229 766 Ex TA PConc Ex TA
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 719 Gd GLQ 1173 Unf 0
## 321 Av Unf 0 Unf 0
## 12 No GLQ 998 Unf 0
## 643 Gd GLQ 1410 Unf 0
## 991 No GLQ 1074 Unf 0
## 655 Gd GLQ 1696 Unf 0
## 878 No GLQ 1170 Unf 0
## 322 Av GLQ 970 Unf 0
## 609 Mn BLQ 428 Unf 0
## 310 Gd GLQ 1274 Unf 0
## 703 No Unf 0 Unf 0
## 1229 No GLQ 1540 Unf 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 719 138 1311 GasA Ex Y SBrkr
## 321 1249 1249 GasA Ex Y SBrkr
## 12 177 1175 GasA Ex Y SBrkr
## 643 542 1952 GasA Gd Y SBrkr
## 991 322 1396 GasA Ex Y SBrkr
## 655 413 2109 GasA Ex Y SBrkr
## 878 292 1462 GasA Ex Y SBrkr
## 322 166 1136 GasA Ex Y SBrkr
## 609 537 965 GasA TA Y SBrkr
## 310 622 1896 GasA Ex Y SBrkr
## 703 1234 1234 GasA Ex Y SBrkr
## 1229 162 1702 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 719 1325 1093 0 2418 1 0
## 321 1249 1347 0 2596 0 0
## 12 1182 1142 0 2324 1 0
## 643 2000 704 0 2704 1 0
## 991 1407 985 0 2392 1 0
## 655 2113 0 0 2113 1 0
## 878 1462 762 0 2224 1 0
## 322 1136 1332 0 2468 1 0
## 609 1940 1254 0 3194 0 0
## 310 1944 0 0 1944 1 0
## 703 1264 1312 0 2576 0 0
## 1229 1702 0 0 1702 1 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 719 2 1 3 1 Gd 9
## 321 3 1 4 1 Gd 9
## 12 3 0 4 1 Ex 11
## 643 2 1 4 1 Ex 9
## 991 2 1 3 1 Gd 7
## 655 2 1 2 1 Gd 7
## 878 2 1 4 1 Ex 10
## 322 2 1 4 1 Gd 10
## 609 2 1 4 1 TA 10
## 310 2 0 3 1 Ex 8
## 703 2 1 4 1 Ex 10
## 1229 1 1 1 1 Ex 7
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 719 Typ 1 TA Attchd 1993 RFn
## 321 Typ 0 NoF Attchd 2006 RFn
## 12 Typ 2 Gd BuiltIn 2005 Fin
## 643 Typ 3 TA Attchd 1972 Fin
## 991 Typ 1 TA Attchd 1997 Fin
## 655 Typ 1 TA Attchd 1995 Fin
## 878 Typ 1 Gd Attchd 2004 Fin
## 322 Typ 1 Gd BuiltIn 2004 Fin
## 609 Typ 2 Gd Basment 1934 Unf
## 310 Typ 3 Ex Attchd 2003 Fin
## 703 Typ 1 Gd BuiltIn 2006 Fin
## 1229 Typ 1 Gd Attchd 2008 Fin
## GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 719 3 983 TA TA Y 250
## 321 3 840 TA TA Y 240
## 12 3 736 TA TA Y 147
## 643 2 538 TA TA Y 269
## 991 3 870 TA TA Y 0
## 655 3 839 TA TA Y 236
## 878 3 738 TA TA Y 184
## 322 3 872 TA TA Y 184
## 609 2 380 TA TA Y 0
## 310 3 708 TA TA Y 208
## 703 3 666 TA TA Y 324
## 1229 3 1052 TA TA Y 0
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 719 154 216 0 0 0 NoP
## 321 154 0 0 0 0 NoP
## 12 21 0 0 0 0 NoP
## 643 111 0 0 0 0 NoP
## 991 70 0 0 0 0 NoP
## 655 46 0 0 0 0 NoP
## 878 0 0 0 0 0 NoP
## 322 154 0 0 0 0 NoP
## 609 0 0 0 0 0 NoP
## 310 175 0 0 0 0 NoP
## 703 100 0 0 0 0 NoP
## 1229 72 0 0 224 0 NoP
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 719 NoF NoM 0 8 2008 WD Normal
## 321 NoF NoM 0 9 2006 New Partial
## 12 NoF NoM 0 7 2006 New Partial
## 643 MnPrv NoM 0 7 2009 WD Normal
## 991 NoF NoM 0 6 2006 WD Normal
## 655 NoF NoM 0 8 2008 WD Normal
## 878 NoF NoM 0 6 2009 WD Normal
## 322 NoF NoM 0 6 2007 WD Normal
## 609 NoF NoM 0 9 2007 WD Alloca
## 310 NoF NoM 0 11 2006 WD Normal
## 703 NoF NoM 0 7 2006 New Partial
## 1229 NoF NoM 0 10 2008 New Partial
## SalePrice
## 719 341000
## 321 342643
## 12 345000
## 643 345000
## 991 348000
## 655 350000
## 878 350000
## 322 354000
## 609 359100
## 310 360000
## 703 361919
## 1229 367294
## [ reached getOption("max.print") -- omitted 48 rows ]
# Pass each of these code/count-like columns through formatC() and store the
# result back in place.
count_like_cols <- c(
  "MSSubClass", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath",
  "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
  "MoSold", "YrSold"
)
modified_data[count_like_cols] <- lapply(
  modified_data[count_like_cols],
  formatC
)
# OverallQual and OverallCond are deliberately left out for now:
#modified_data$OverallQual <- formatC(modified_data$OverallQual)
#modified_data$OverallCond <- formatC(modified_data$OverallCond)
# `temp` stands in for the actual modified dataset until everything is
# finalized; `colname` keeps the column names for lookup by index.
colname <- names(modified_data)
temp <- modified_data
# Remember which attributes to remove.
toremove <- outers <- as.vector(0)
# Counters, all starting at zero.
gg <- attrib <- global <- 0
This will be a strenuous process, but I will go through each variable. We shall see whether the levels need to be reduced for the categorical ones, and whether outliers need to be dealt with for the numerical ones.
attrib = attrib + 1
colname[attrib]
## [1] "Id"
# Numeric attribute: draw a boxplot, report the outliers it found, and list
# the rows that lie outside the whiskers.
if (typeof(temp[, attrib]) == "integer") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Flag values beyond either end (previously only the upper end was checked).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Last expression of the chunk, so the flagged rows are auto-printed.
  temp[outlierID, ]
}
## [1] 0
## numeric(0)
## [1] Id MSSubClass MSZoning LotFrontage LotArea
## [6] Street Alley LotShape LandContour Utilities
## [11] LotConfig LandSlope Neighborhood Condition1 Condition2
## [16] BldgType HouseStyle OverallQual OverallCond YearBuilt
## [21] YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## [26] MasVnrType MasVnrArea ExterQual ExterCond Foundation
## [31] BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## [36] BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## [41] HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## [46] LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## [51] HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## [56] Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## [61] GarageFinish GarageCars GarageArea GarageQual GarageCond
## [66] PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## [71] ScreenPorch PoolArea PoolQC Fence MiscFeature
## [76] MiscVal MoSold YrSold SaleType SaleCondition
## [81] SalePrice
## <0 rows> (or 0-length row.names)
# Categorical attribute: print the level counts and a SalePrice boxplot per
# level.
if (identical(typeof(temp[, attrib]), "character")) {
  level_counts <- as.data.frame(table(temp[, attrib]))
  print(level_counts)
  price_by_level <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(price_by_level)
  # Not marked for removal:
  # global = global + 1
  # toremove[global] = attrib
}
Obviously there is no outlier problem with our ID variables. Nothing to be removed.
attrib = attrib + 1
colname[attrib]
## [1] "MSSubClass"
# Numeric attribute: draw a boxplot, report the outliers it found, and list
# the rows that lie outside the whiskers.
if (typeof(temp[, attrib]) == "integer") {
  graph <- boxplot(temp[, attrib])
  # Values that are not the last expression inside braces are not
  # auto-printed, so print them explicitly.
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Flag values beyond either end (previously only the upper end was checked).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Last expression of the chunk, so the flagged rows are auto-printed.
  temp[outlierID, ]
}
# Categorical attribute: show counts and SalePrice per level, regroup the
# MSSubClass codes into four broader levels, then show both again.
if (identical(typeof(temp[, attrib]), "character")) {
  # Before regrouping. The plot is printed immediately because its aesthetics
  # refer to temp, which is modified below.
  print(as.data.frame(table(temp[, attrib])))
  plot_before <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(plot_before)

  # Reassigning levels
  temp$MSSubClass <- factor(temp$MSSubClass)
  levels(temp$MSSubClass) <- list(
    One_Story = c("20", "30", "40", "45", "50"),
    Two_Story = c("60", "70", "75"),
    SplitDuplex = c("80", "85", "90"),
    PUD = c("120", "160", "180", "190")
  )

  # After regrouping.
  print(as.data.frame(table(temp[, attrib])))
  plot_after <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(plot_after)
  # Not marked for removal:
  # global = global + 1
  # toremove[global] = attrib
}
## Var1 Freq
## 1 120 85
## 2 160 63
## 3 180 10
## 4 190 30
## 5 20 532
## 6 30 69
## 7 40 4
## 8 45 12
## 9 50 144
## 10 60 296
## 11 70 60
## 12 75 16
## 13 80 57
## 14 85 19
## 15 90 52
## Var1 Freq
## 1 One_Story 761
## 2 Two_Story 372
## 3 SplitDuplex 128
## 4 PUD 188
Reduced the number of factors
attrib = attrib + 1
colname[attrib]
## [1] "MSZoning"
# Numeric attribute: draw a boxplot, report the outliers it found, and list
# the rows that lie outside the whiskers.
if (typeof(temp[, attrib]) == "integer") {
  graph <- boxplot(temp[, attrib])
  # Values that are not the last expression inside braces are not
  # auto-printed, so print them explicitly.
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Flag values beyond either end (previously only the upper end was checked).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Last expression of the chunk, so the flagged rows are auto-printed.
  temp[outlierID, ]
}
# Categorical attribute: print the level counts and a SalePrice boxplot per
# level, then record this column index in toremove.
if (identical(typeof(temp[, attrib]), "character")) {
  level_counts <- as.data.frame(table(temp[, attrib]))
  print(level_counts)
  price_by_level <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(price_by_level)
  # Mark the attribute for removal.
  global <- global + 1
  toremove[global] <- attrib
}
## Var1 Freq
## 1 C (all) 10
## 2 FV 62
## 3 RH 16
## 4 RL 1144
## 5 RM 217
I decided that the number of categorical levels is appropriate and does not need to be reduced. However, judging by the variance, I will probably remove this one: the distribution is too skewed and there is too little variance.
attrib = attrib + 1
colname[attrib]
## [1] "LotFrontage"
# Numeric attribute: draw a boxplot, report the outliers it found, keep the
# outlying rows in outlier_lotFrontage, and record the column index in outers.
if (typeof(temp[, attrib]) == "integer" || typeof(temp[, attrib]) == "double") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Build the saved subset from the same two-sided row set as outlierID
  # (previously it only kept rows above the upper whisker).
  outlier_lotFrontage <- temp[outlierID, , drop = FALSE]
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 16
## [1] 141 174 174 140 150 137 144 149 313 168 182 138 160 152 313 153
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
I flagged all the potential outliers in the variable outlier_lotFrontage. I will not remove them yet because I might just remove this attribute altogether.
attrib = attrib + 1
colname[attrib]
## [1] "LotArea"
# Numeric attribute: draw a boxplot, report the outliers it found, keep the
# outlying rows in outlier_lotArea, and record the column index in outers.
if (typeof(temp[, attrib]) %in% c("integer", "double")) {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # Rows beyond the lower (stats[1]) or upper (stats[5]) whisker end.
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Value is discarded here (not the last expression of the chunk).
  temp[outlierID, ]
  outlier_lotArea <- temp[outlierID, , drop = FALSE]
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 67
## [1] 50271 19900 21000 21453 19378 31770 22950 25419 159000 19296
## [11] 39104 19138 18386 215245 164660 20431 18800 53107 34650 22420
## [21] 21750 70761 53227 40094 21872 21780 25095 46589 20896 18450
## [31] 21535 26178 115149 21695 53504 21384 28698 45600 17920 25286
## [41] 27650 24090 25000 1300 21286 21750 29959 18000 23257 17755
## [51] 35760 18030 35133 32463 18890 24682 23595 17871 36500 63887
## [61] 20781 25339 57200 20544 19690 21930 26142
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
# global = global + 1
# toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "Street"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Grvl 6
## 2 Pave 1443
I will probably remove this one: the distribution is too skewed and there is too little variance.
attrib = attrib + 1
colname[attrib]
## [1] "Alley"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Grvl 50
## 2 None 1359
## 3 Pave 40
I will probably remove this one: the distribution is too skewed and there is too little variance.
attrib = attrib + 1
colname[attrib]
## [1] "LotShape"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
# Categorical attribute: show counts and SalePrice per level, merge the three
# irregular LotShape levels into one, then show both again.
if (identical(typeof(temp[, attrib]), "character")) {
  # Before regrouping. The plot is printed immediately because its aesthetics
  # refer to temp, which is modified below.
  print(as.data.frame(table(temp[, attrib])))
  plot_before <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(plot_before)

  # Reassigning levels
  temp$LotShape <- factor(temp$LotShape)
  levels(temp$LotShape) <- list(
    IR = c("IR1", "IR2", "IR3"),
    Reg = "Reg"
  )

  # After regrouping.
  print(as.data.frame(table(temp[, attrib])))
  plot_after <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(plot_after)
  # Not marked for removal:
  # global = global + 1
  # toremove[global] = attrib
}
## Var1 Freq
## 1 IR1 482
## 2 IR2 41
## 3 IR3 10
## 4 Reg 916
## Var1 Freq
## 1 IR 533
## 2 Reg 916
After looking at the distribution (variances) in each level and how closely related IR1,IR2,IR3 were to each other, I decided to group them. This will level the distribution a bit.
attrib = attrib + 1
colname[attrib]
## [1] "LandContour"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Bnk 63
## 2 HLS 50
## 3 Low 36
## 4 Lvl 1300
Will be dropped for sure.
attrib = attrib + 1
colname[attrib]
## [1] "Utilities"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 AllPub 1448
## 2 NoSeWa 1
Will be dropped for sure.
attrib = attrib + 1
colname[attrib]
## [1] "LotConfig"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Corner 262
## 2 CulDSac 93
## 3 FR2 47
## 4 FR3 3
## 5 Inside 1044
Will drop.
attrib = attrib + 1
colname[attrib]
## [1] "LandSlope"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Gtl 1371
## 2 Mod 65
## 3 Sev 13
Will drop
attrib = attrib + 1
colname[attrib]
## [1] "Neighborhood"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Blmngtn 17
## 2 Blueste 2
## 3 BrDale 16
## 4 BrkSide 58
## 5 ClearCr 28
## 6 CollgCr 148
## 7 Crawfor 50
## 8 Edwards 100
## 9 Gilbert 78
## 10 IDOTRR 37
## 11 MeadowV 17
## 12 Mitchel 49
## 13 NAmes 225
## 14 NoRidge 41
## 15 NPkVill 9
## 16 NridgHt 75
## 17 NWAmes 73
## 18 OldTown 113
## 19 Sawyer 74
## 20 SawyerW 58
## 21 Somerst 83
## 22 StoneBr 25
## 23 SWISU 25
## 24 Timber 37
## 25 Veenker 11
Will not touch this because I feel it will be very important.
attrib = attrib + 1
colname[attrib]
## [1] "Condition1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Artery 48
## 2 Feedr 81
## 3 Norm 1249
## 4 PosA 8
## 5 PosN 19
## 6 RRAe 11
## 7 RRAn 26
## 8 RRNe 2
## 9 RRNn 5
Distribution is very bad, will drop
attrib = attrib + 1
colname[attrib]
## [1] "Condition2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Artery 2
## 2 Feedr 6
## 3 Norm 1434
## 4 PosA 1
## 5 PosN 2
## 6 RRAe 1
## 7 RRAn 1
## 8 RRNn 2
Even worse variance; will drop.
attrib = attrib + 1
colname[attrib]
## [1] "BldgType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 1Fam 1211
## 2 2fmCon 31
## 3 Duplex 52
## 4 Twnhs 43
## 5 TwnhsE 112
I like the idea of this attribute; unfortunately, the variance is not within my ruleset.
attrib = attrib + 1
colname[attrib]
## [1] "HouseStyle"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 1.5Fin 154
## 2 1.5Unf 14
## 3 1Story 720
## 4 2.5Fin 8
## 5 2.5Unf 11
## 6 2Story 442
## 7 SFoyer 36
## 8 SLvl 64
Although the distribution is not that good, we shall leave it as is because we want to preserve the information it gives. Perhaps later we will remove it.
attrib = attrib + 1
colname[attrib]
## [1] "OverallQual"
# OverallQual is selected by name rather than by type: print the count of each
# rating and a SalePrice boxplot per rating.
if (colname[attrib] == "OverallQual") {
  rating_counts <- as.data.frame(table(temp[, attrib]))
  print(rating_counts)
  price_by_rating <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(price_by_rating)
  # Not marked for removal:
  #global = global + 1
  #toremove[global] = attrib
}
## Var1 Freq
## 1 1 2
## 2 2 3
## 3 3 20
## 4 4 116
## 5 5 395
## 6 6 372
## 7 7 314
## 8 8 167
## 9 9 43
## 10 10 17
# Numeric review, skipped for OverallQual (which is reviewed by name instead).
# `&&` is the scalar, short-circuiting operator intended for if() conditions;
# the elementwise `&` was used here before.
if (typeof(temp[, attrib]) == "integer" && colname[attrib] != "OverallQual") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Flag values beyond either end (previously only the upper end was checked).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Last expression of the chunk, so the flagged rows are auto-printed.
  temp[outlierID, ]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
There is no reason for us to be changing this. Changing to quality ranges (e.g. 1-3, 4-7, 8-10) would not help the distribution. Will keep as is.
attrib = attrib + 1
colname[attrib]
## [1] "OverallCond"
# OverallCond is selected by name rather than by type: print the count of each
# rating and a SalePrice boxplot per rating.
if (colname[attrib] == "OverallCond") {
  rating_counts <- as.data.frame(table(temp[, attrib]))
  print(rating_counts)
  price_by_rating <- ggplot(
    data = temp,
    aes(x = temp[, attrib], y = temp$SalePrice)
  ) +
    geom_boxplot(aes(fill = factor(temp[, attrib]))) +
    scale_x_discrete(name = colname[attrib]) +
    labs(fill = colname[attrib])
  print(price_by_rating)
  # Not marked for removal:
  #global = global + 1
  #toremove[global] = attrib
}
## Var1 Freq
## 1 1 1
## 2 2 5
## 3 3 24
## 4 4 57
## 5 5 813
## 6 6 251
## 7 7 205
## 8 8 71
## 9 9 22
# Numeric review, skipped for OverallCond (which is reviewed by name instead).
# `&&` is the scalar, short-circuiting operator intended for if() conditions;
# the elementwise `&` was used here before.
if (typeof(temp[, attrib]) == "integer" && colname[attrib] != "OverallCond") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Flag values beyond either end (previously only the upper end was checked).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Last expression of the chunk, so the flagged rows are auto-printed.
  temp[outlierID, ]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
Again, the distribution does not look too good towards the extremes, but there is no way to fix this without removing information. Will keep
attrib = attrib + 1
colname[attrib]
## [1] "YearBuilt"
# Numeric attribute: draw a boxplot, report the outliers it found, keep the
# outlying rows in outlier_yearbuilt, and record the column index in outers.
if (typeof(temp[, attrib]) == "integer" || typeof(temp[, attrib]) == "double") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # The row index must be two-sided: an upper-only test misses outliers that
  # fall below the lower whisker, and disagreed with the saved subset.
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  outlier_yearbuilt <- temp[outlierID, , drop = FALSE]
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 7
## [1] 1880 1880 1880 1882 1880 1875 1872
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "YearRemodAdd"
# Numeric attribute: draw a boxplot, report the outliers it found, and list
# the rows that lie outside the whiskers.
if (typeof(temp[, attrib]) == "integer" || typeof(temp[, attrib]) == "double") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Flag values beyond either end (previously only the upper end was checked).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  # Last expression of the chunk, so the flagged rows are auto-printed.
  temp[outlierID, ]
}
## [1] 0
## numeric(0)
## [1] Id MSSubClass MSZoning LotFrontage LotArea
## [6] Street Alley LotShape LandContour Utilities
## [11] LotConfig LandSlope Neighborhood Condition1 Condition2
## [16] BldgType HouseStyle OverallQual OverallCond YearBuilt
## [21] YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## [26] MasVnrType MasVnrArea ExterQual ExterCond Foundation
## [31] BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## [36] BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## [41] HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## [46] LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## [51] HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## [56] Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## [61] GarageFinish GarageCars GarageArea GarageQual GarageCond
## [66] PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## [71] ScreenPorch PoolArea PoolQC Fence MiscFeature
## [76] MiscVal MoSold YrSold SaleType SaleCondition
## [81] SalePrice
## <0 rows> (or 0-length row.names)
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "RoofStyle"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Flat 13
## 2 Gable 1131
## 3 Gambrel 11
## 4 Hip 285
## 5 Mansard 7
## 6 Shed 2
attrib = attrib + 1
colname[attrib]
## [1] "RoofMatl"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 ClyTile 1
## 2 CompShg 1423
## 3 Membran 1
## 4 Metal 1
## 5 Roll 1
## 6 Tar&Grv 11
## 7 WdShake 5
## 8 WdShngl 6
Will remove this attribute.
attrib = attrib + 1
colname[attrib]
## [1] "Exterior1st"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 AsbShng 20
## 2 AsphShn 1
## 3 BrkComm 2
## 4 BrkFace 50
## 5 CBlock 1
## 6 CemntBd 59
## 7 HdBoard 222
## 8 ImStucc 1
## 9 MetalSd 220
## 10 Plywood 108
## 11 Stone 2
## 12 Stucco 25
## 13 VinylSd 508
## 14 Wd Sdng 205
## 15 WdShing 25
attrib = attrib + 1
colname[attrib]
## [1] "Exterior2nd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 AsbShng 20
## 2 AsphShn 3
## 3 Brk Cmn 7
## 4 BrkFace 25
## 5 CBlock 1
## 6 CmentBd 58
## 7 HdBoard 206
## 8 ImStucc 10
## 9 MetalSd 214
## 10 Other 1
## 11 Plywood 142
## 12 Stone 4
## 13 Stucco 26
## 14 VinylSd 497
## 15 Wd Sdng 197
## 16 Wd Shng 38
attrib = attrib + 1
colname[attrib]
## [1] "MasVnrType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 BrkCmn 15
## 2 BrkFace 445
## 3 None 862
## 4 Stone 127
attrib = attrib + 1
colname[attrib]
## [1] "MasVnrArea"
# Numeric attribute: draw a boxplot, report the outliers it found, keep the
# outlying rows in outlier_masVnrArea, and record the column index in outers.
if (typeof(temp[, attrib]) == "integer" || typeof(temp[, attrib]) == "double") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Use one two-sided row index for both outlierID and the saved subset
  # (outlierID was previously upper-only and disagreed with the subset).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  outlier_masVnrArea <- temp[outlierID, , drop = FALSE]
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 96
## [1] 640 650 456 1031 573 1115 576 443 468 600 768 480 1129 436
## [15] 456 664 653 491 748 456 922 506 604 472 481 1600 616 870
## [29] 530 500 510 650 432 473 772 435 562 921 762 594 479 584
## [43] 420 459 452 513 472 660 528 464 1170 630 466 651 442 894
## [57] 513 673 603 860 424 1047 442 816 760 541 423 424 975 450
## [71] 423 571 480 425 660 1378 456 425 420 766 554 632 567 451
## [85] 621 788 796 428 564 579 705 731 420 448 426 438
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "ExterQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 51
## 2 Fa 14
## 3 Gd 480
## 4 TA 904
attrib = attrib + 1
colname[attrib]
## [1] "ExterCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 3
## 2 Fa 28
## 3 Gd 145
## 4 Po 1
## 5 TA 1272
attrib = attrib + 1
colname[attrib]
## [1] "Foundation"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 BrkTil 146
## 2 CBlock 633
## 3 PConc 637
## 4 Slab 24
## 5 Stone 6
## 6 Wood 3
attrib = attrib + 1
colname[attrib]
## [1] "BsmtQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 120
## 2 Fa 35
## 3 Gd 609
## 4 NoB 37
## 5 TA 648
attrib = attrib + 1
colname[attrib]
## [1] "BsmtCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Fa 45
## 2 Gd 64
## 3 NoB 37
## 4 Po 2
## 5 TA 1301
Might drop too
attrib = attrib + 1
colname[attrib]
## [1] "BsmtExposure"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Av 219
## 2 Gd 133
## 3 Mn 114
## 4 No 945
## 5 NoB 38
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinType1"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 ALQ 220
## 2 BLQ 148
## 3 GLQ 411
## 4 LwQ 74
## 5 NoB 37
## 6 Rec 132
## 7 Unf 427
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinSF1"
# Numeric attribute: draw a boxplot, report the outliers it found, keep the
# outlying rows in outlier_bsmtFinSF1, and record the column index in outers.
if (typeof(temp[, attrib]) == "integer" || typeof(temp[, attrib]) == "double") {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # graph$stats[1] / graph$stats[5] are the lower / upper whisker ends.
  # Use one two-sided row index for both outlierID and the saved subset
  # (outlierID was previously upper-only and disagreed with the subset).
  outlierID <- which(
    temp[, attrib] < graph$stats[1] | temp[, attrib] > graph$stats[5]
  )
  outlier_bsmtFinSF1 <- temp[outlierID, , drop = FALSE]
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 8
## [1] 1810 1880 1904 1767 2260 2188 2096 5644
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinType2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 ALQ 19
## 2 BLQ 33
## 3 GLQ 14
## 4 LwQ 46
## 5 NoB 38
## 6 Rec 54
## 7 Unf 1245
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFinSF2"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtFinSF2 = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 167
## [1] 32 668 486 93 491 506 712 362 41 169 869 150 670 28
## [15] 1080 181 768 215 374 208 441 184 279 306 180 712 580 690
## [29] 692 228 125 1063 620 175 820 1474 264 479 147 232 380 544
## [43] 294 258 121 180 391 531 344 539 713 210 311 1120 165 532
## [57] 279 96 495 180 174 1127 139 202 645 123 551 219 606 147
## [71] 612 480 182 132 336 468 287 35 499 180 180 723 119 182
## [85] 40 551 117 239 80 472 64 1057 127 630 480 128 377 764
## [99] 345 539 1085 435 823 500 290 324 634 411 841 1061 93 466
## [113] 396 354 294 149 193 117 273 465 400 468 41 682 64 557
## [127] 230 106 791 240 287 547 391 469 177 108 374 600 492 211
## [141] 168 96 1031 438 375 144 81 906 608 276 661 68 173 972
## [155] 105 420 469 546 334 352 872 374 110 627 163 1029 290
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtUnfSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_bsmtUnfSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 29
## [1] 1777 1768 1907 1686 2336 1694 2121 1869 2153 1969 1709 2042 1774 2046
## [15] 1836 1935 1926 1734 1800 1753 1905 1800 1710 1752 1694 1689 2002 1753
## [29] 1795
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "TotalBsmtSF"
# Numeric column: boxplot the values, report the points outside the whiskers,
# keep the affected rows and record the column index in `outers`.
# is.numeric() replaces the typeof() comparison so that a factor column
# (typeof "integer") is not treated as numeric.
if (is.numeric(temp[, attrib])) {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # Row indices above the upper whisker only. graph$out (printed above) and
  # the subset below also cover values under the lower whisker, so outlierID
  # can be shorter than graph$out.
  outlierID <- which(temp[, attrib] > graph$stats[5])
  # Not the last expression of the branch, so this value is discarded.
  temp[outlierID, ]
  # Rows beyond either whisker (stats[1] = lower, stats[5] = upper).
  outlier_totalBsmtSF <- subset(
    temp,
    temp[, attrib] > graph$stats[5] | temp[, attrib] < graph$stats[1]
  )
  # Append this column index to the list of columns with outliers.
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 60
## [1] 0 0 2223 0 0 0 2216 0 2392 0 2121 2136 3206 0
## [15] 0 0 0 3094 2153 3200 0 3138 0 0 0 0 2109 2077
## [29] 2444 0 0 0 0 2078 0 2217 0 0 2330 0 0 0
## [43] 0 2524 0 0 0 0 0 2396 2158 0 0 2136 0 2110
## [57] 6110 0 2633 0
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "Heating"
# Numeric column: draw a base boxplot and report the points that fall outside
# the whiskers. is.numeric() replaces the typeof() comparison so that a factor
# column (whose typeof() is "integer") is not mistaken for numeric data.
if (is.numeric(temp[, attrib])) {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # Row indices whose value lies above the upper whisker (graph$stats[5]).
  outlierID <- which(temp[, attrib] > graph$stats[5])
  # Last expression of the branch: its value is auto-printed when this runs
  # at top level.
  temp[outlierID, ]
}
# Character column: frequency table of the levels, a boxplot of SalePrice per
# level, and the column index is appended to `toremove`.
if (is.character(temp[, attrib])) {
  print(as.data.frame(table(temp[, attrib])))
  # Columns are referenced through the .data pronoun instead of temp[, ...] /
  # temp$..., so the aesthetics are resolved from the `data` argument.
  print(
    ggplot(data = temp, aes(x = .data[[names(temp)[attrib]]], y = SalePrice)) +
      geom_boxplot(aes(fill = factor(.data[[names(temp)[attrib]]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # Record this column in `toremove` (presumably dropped later - confirm
  # against the code that consumes `toremove`).
  global <- global + 1
  toremove[global] <- attrib
}
## Var1 Freq
## 1 Floor 1
## 2 GasA 1417
## 3 GasW 18
## 4 Grav 7
## 5 OthW 2
## 6 Wall 4
attrib = attrib + 1
colname[attrib]
## [1] "HeatingQC"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 733
## 2 Fa 49
## 3 Gd 239
## 4 Po 1
## 5 TA 427
attrib = attrib + 1
colname[attrib]
## [1] "CentralAir"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 N 95
## 2 Y 1354
attrib = attrib + 1
colname[attrib]
## [1] "Electrical"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 FuseA 94
## 2 FuseF 27
## 3 FuseP 3
## 4 Mix 1
## 5 SBrkr 1324
attrib = attrib + 1
colname[attrib]
## [1] "X1stFlrSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_x1stFlrSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 19
## [1] 2207 2223 2259 2158 2234 2392 2402 3228 3138 2444 2217 2364 2898 2524
## [15] 2411 2196 4692 2156 2633
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "X2ndFlrSF"
# Numeric column: boxplot the values, report the points outside the whiskers,
# keep the affected rows and record the column index in `outers`.
# is.numeric() replaces the typeof() comparison so that a factor column
# (typeof "integer") is not treated as numeric.
if (is.numeric(temp[, attrib])) {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # Row indices above the upper whisker (graph$stats[5]).
  outlierID <- which(temp[, attrib] > graph$stats[5])
  # Not the last expression of the branch, so this value is discarded.
  temp[outlierID, ]
  # Rows beyond either whisker (stats[1] = lower, stats[5] = upper).
  # NOTE(review): the variable is spelled "x2st" rather than "x2nd"; the name
  # is left unchanged here because other parts of the script may refer to it.
  outlier_x2stFlrSF <- subset(
    temp,
    temp[, attrib] > graph$stats[5] | temp[, attrib] < graph$stats[1]
  )
  # Append this column index to the list of columns with outliers.
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 2
## [1] 1872 2065
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "LowQualFinSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_lowQualFinSF = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 26
## [1] 360 513 234 528 572 144 392 371 390 420 473 156 515 360 80 80 53
## [18] 232 481 120 514 397 479 205 80 384
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GrLivArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_grLivArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 31
## [1] 2945 3222 3608 3112 2794 3493 2978 3228 4676 2775 3194 3395 4316 3279
## [15] 3140 2822 2872 2898 3082 2868 2828 3627 3086 2872 4476 3447 5642 2810
## [29] 2792 3238 2784
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "BsmtFullBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 852
## 2 1 581
## 3 2 15
## 4 3 1
attrib = attrib + 1
colname[attrib]
## [1] "BsmtHalfBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 1368
## 2 1 79
## 3 2 2
attrib = attrib + 1
colname[attrib]
## [1] "FullBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 9
## 2 1 648
## 3 2 760
## 4 3 32
attrib = attrib + 1
colname[attrib]
## [1] "HalfBath"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 908
## 2 1 529
## 3 2 12
attrib = attrib + 1
colname[attrib]
## [1] "BedroomAbvGr"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 6
## 2 1 49
## 3 2 354
## 4 3 800
## 5 4 211
## 6 5 21
## 7 6 7
## 8 8 1
attrib = attrib + 1
colname[attrib]
## [1] "KitchenAbvGr"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 1
## 2 1 1382
## 3 2 64
## 4 3 2
attrib = attrib + 1
colname[attrib]
## [1] "KitchenQual"
# Numeric column: boxplot the values, report the points outside the whiskers,
# keep the affected rows and record the column index in `outers`.
# is.numeric() replaces the typeof() comparison so that a factor column
# (typeof "integer") is not treated as numeric.
if (is.numeric(temp[, attrib])) {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # Row indices above the upper whisker (graph$stats[5]).
  outlierID <- which(temp[, attrib] > graph$stats[5])
  # Not the last expression of the branch, so this value is discarded.
  temp[outlierID, ]
  # NOTE(review): the result is stored as `outlier_bedroomAbvGr` although this
  # segment inspects a different column, and the segment's rendered output is
  # a frequency table (character branch), so this assignment looks like a
  # copy-paste leftover that never runs here - confirm and relocate or remove.
  outlier_bedroomAbvGr <- subset(
    temp,
    temp[, attrib] > graph$stats[5] | temp[, attrib] < graph$stats[1]
  )
  gg <- gg + 1
  outers[gg] <- attrib
}
# Character column: frequency table of the levels, then a boxplot of SalePrice
# per level.
if (is.character(temp[, attrib])) {
  print(as.data.frame(table(temp[, attrib])))
  # Columns are referenced through the .data pronoun instead of temp[, ...] /
  # temp$..., so the aesthetics are resolved from the `data` argument.
  print(
    ggplot(data = temp, aes(x = .data[[names(temp)[attrib]]], y = SalePrice)) +
      geom_boxplot(aes(fill = factor(.data[[names(temp)[attrib]]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  #global = global + 1
  #toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 99
## 2 Fa 39
## 3 Gd 578
## 4 TA 733
attrib = attrib + 1
colname[attrib]
## [1] "TotRmsAbvGrd"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 10 47
## 2 11 18
## 3 12 11
## 4 14 1
## 5 2 1
## 6 3 17
## 7 4 96
## 8 5 273
## 9 6 400
## 10 7 325
## 11 8 186
## 12 9 74
attrib = attrib + 1
colname[attrib]
## [1] "Functional"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Maj1 13
## 2 Maj2 5
## 3 Min1 31
## 4 Min2 34
## 5 Mod 15
## 6 Sev 1
## 7 Typ 1350
attrib = attrib + 1
colname[attrib]
## [1] "Fireplaces"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 0 684
## 2 1 647
## 3 2 113
## 4 3 5
attrib = attrib + 1
colname[attrib]
## [1] "FireplaceQu"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 24
## 2 Fa 33
## 3 Gd 377
## 4 NoF 684
## 5 Po 20
## 6 TA 311
attrib = attrib + 1
colname[attrib]
## [1] "GarageType"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 2Types 6
## 2 Attchd 862
## 3 Basment 19
## 4 BuiltIn 86
## 5 CarPort 9
## 6 Detchd 386
## 7 NoG 81
attrib = attrib + 1
colname[attrib]
## [1] "GarageYrBlt"
# Numeric column: draw a base boxplot and report the points that fall outside
# the whiskers. is.numeric() replaces the typeof() comparison so that a factor
# column (whose typeof() is "integer") is not mistaken for numeric data.
if (is.numeric(temp[, attrib])) {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # Row indices whose value lies above the upper whisker (graph$stats[5]).
  outlierID <- which(temp[, attrib] > graph$stats[5])
  # Last expression of the branch: its value is auto-printed when this runs
  # at top level.
  temp[outlierID, ]
}
# Character column: frequency table of the levels, then a boxplot of SalePrice
# per level. The rendered table for this column contains a "NoG" level next
# to the years, which is why it is handled as text rather than as a number.
if (is.character(temp[, attrib])) {
  print(as.data.frame(table(temp[, attrib])))
  # Columns are referenced through the .data pronoun instead of temp[, ...] /
  # temp$..., so the aesthetics are resolved from the `data` argument.
  print(
    ggplot(data = temp, aes(x = .data[[names(temp)[attrib]]], y = SalePrice)) +
      geom_boxplot(aes(fill = factor(.data[[names(temp)[attrib]]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  #global = global + 1
  #toremove[global] = attrib
}
## Var1 Freq
## 1 1900 1
## 2 1906 1
## 3 1908 1
## 4 1910 3
## 5 1914 2
## 6 1915 2
## 7 1916 5
## 8 1918 2
## 9 1920 14
## 10 1921 3
## 11 1922 5
## 12 1923 3
## 13 1924 3
## 14 1925 10
## 15 1926 6
## 16 1927 1
## 17 1928 4
## 18 1929 2
## 19 1930 8
## 20 1931 4
## 21 1932 3
## 22 1933 1
## 23 1934 2
## 24 1935 4
## 25 1936 5
## 26 1937 2
## 27 1938 3
## 28 1939 9
## 29 1940 14
## 30 1941 10
## 31 1942 2
## 32 1945 4
## 33 1946 4
## 34 1947 2
## 35 1948 11
## 36 1949 8
## 37 1950 24
## 38 1951 6
## 39 1952 3
## 40 1953 12
## 41 1954 19
## 42 1955 13
## 43 1956 16
## 44 1957 20
## 45 1958 21
## 46 1959 17
## 47 1960 19
## 48 1961 13
## 49 1962 21
## 50 1963 16
## 51 1964 18
## 52 1965 21
## 53 1966 21
## 54 1967 15
## 55 1968 26
## 56 1969 15
## 57 1970 20
## 58 1971 13
## 59 1972 14
## 60 1973 14
## 61 1974 17
## 62 1975 8
## 63 1976 29
## 64 1977 35
## 65 1978 19
## 66 1979 15
## 67 1980 15
## 68 1981 10
## 69 1982 4
## 70 1983 7
## 71 1984 8
## 72 1985 10
## 73 1986 6
## 74 1987 11
## 75 1988 14
## 76 1989 10
## 77 1990 16
## 78 1991 9
## 79 1992 13
## 80 1993 22
## 81 1994 18
## 82 1995 18
## 83 1996 20
## 84 1997 19
## 85 1998 31
## 86 1999 30
## 87 2000 27
## 88 2001 20
## 89 2002 24
## 90 2003 49
## 91 2004 52
## 92 2005 65
## 93 2006 58
## 94 2007 45
## 95 2008 29
## 96 2009 21
## 97 2010 3
## 98 NoG 81
I will not remove this attribute (GarageYrBlt), because it shows a clear pattern.
attrib = attrib + 1
colname[attrib]
## [1] "GarageFinish"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
## Var1 Freq
## 1 Fin 346
## 2 NoG 81
## 3 RFn 417
## 4 Unf 605
attrib = attrib + 1
colname[attrib]
## [1] "GarageCars"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_garagecars = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 5
## [1] 4 4 4 4 4
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GarageArea"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_garageArea = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 20
## [1] 1166 968 1053 1025 1390 1134 983 1020 1220 1248 1043 1052 995 1356
## [15] 1052 954 1014 1418 968 1069
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "GarageQual"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 3
## 2 Fa 48
## 3 Gd 14
## 4 NoG 81
## 5 Po 3
## 6 TA 1300
attrib = attrib + 1
colname[attrib]
## [1] "GarageCond"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 Ex 2
## 2 Fa 35
## 3 Gd 9
## 4 NoG 81
## 5 Po 7
## 6 TA 1315
attrib = attrib + 1
colname[attrib]
## [1] "PavedDrive"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
}
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
global = global + 1
toremove[global] = attrib
}
## Var1 Freq
## 1 N 90
## 2 P 30
## 3 Y 1329
attrib = attrib + 1
colname[attrib]
## [1] "WoodDeckSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_woodDeck = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 32
## [1] 857 576 476 574 441 468 670 495 536 519 466 517 426 503 486 486 511
## [18] 421 550 509 474 728 436 431 448 439 635 500 668 586 431 736
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "OpenPorchSF"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_openPorch = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 75
## [1] 204 213 258 199 234 184 205 228 238 260 198 172 208 228 184 250 175
## [18] 195 214 231 192 187 176 523 285 406 182 502 274 172 243 235 312 267
## [35] 265 288 341 204 174 247 291 312 418 240 364 188 207 234 192 191 252
## [52] 189 282 224 319 244 185 200 180 263 304 234 240 192 198 287 292 207
## [69] 241 547 211 184 262 210 236
if (typeof(temp[,attrib]) == 'character'){
print(as.data.frame(table(temp[,attrib])))
print(ggplot(data = temp, aes(x=temp[,attrib], y=temp$SalePrice)) + geom_boxplot(aes(fill = factor(temp[,attrib]) )) +scale_x_discrete(name = (colname[attrib])) + labs(fill = (colname[attrib])))
#global = global + 1
#toremove[global] = attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "EnclosedPorch"
# Numeric column: draw a base boxplot and report the points that fall outside
# the whiskers. is.numeric() replaces the typeof() comparison so that a factor
# column (whose typeof() is "integer") is not mistaken for numeric data.
if (is.numeric(temp[, attrib])) {
  graph <- boxplot(temp[, attrib])
  print(length(graph$out))
  print(graph$out)
  # Row indices whose value lies above the upper whisker (graph$stats[5]).
  outlierID <- which(temp[, attrib] > graph$stats[5])
  # Last expression of the branch, so the selected rows (all columns) are
  # auto-printed at top level; with many flagged rows this is a long dump that
  # is truncated by getOption("max.print").
  temp[outlierID, ]
}
## [1] 207
## [1] 272 228 205 176 205 87 172 102 37 144 64 114 202 128 156 44 77
## [18] 144 192 144 140 180 228 128 183 39 184 40 552 30 126 96 60 150
## [35] 120 202 77 112 252 52 224 234 144 244 268 137 24 108 294 177 218
## [52] 242 91 112 160 130 184 126 169 105 34 96 248 236 120 32 80 115
## [69] 291 184 116 158 112 210 36 156 144 84 148 116 120 136 102 240 54
## [86] 112 39 100 36 189 293 164 40 216 239 112 252 240 180 67 90 120
## [103] 56 112 129 40 98 143 216 234 112 112 70 386 154 185 156 156 134
## [120] 196 264 185 275 96 120 112 116 230 254 68 194 192 34 150 164 112
## [137] 224 32 318 244 48 94 138 108 112 226 192 174 228 19 170 220 128
## [154] 80 115 137 192 252 112 96 176 216 176 214 280 96 116 102 190 236
## [171] 192 84 330 208 145 259 126 264 81 164 42 123 162 100 286 190 168
## [188] 20 301 198 96 221 112 212 50 150 168 112 160 114 216 154 99 158
## [205] 216 252 112
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 4 4 Two_Story RL 60 9550 Pave None IR
## 8 8 Two_Story RL 0 10382 Pave None IR
## 9 9 One_Story RM 51 6120 Pave None Reg
## 15 15 One_Story RL 0 10920 Pave None IR
## 22 22 One_Story RM 57 7449 Pave Grvl Reg
## 30 30 One_Story RM 60 6324 Pave None IR
## 31 31 Two_Story C (all) 50 8500 Pave Pave Reg
## 49 49 PUD RM 33 4456 Pave None Reg
## 59 59 Two_Story RL 66 13682 Pave None IR
## 62 62 Two_Story RM 60 7200 Pave None Reg
## 64 64 Two_Story RM 50 10300 Pave None IR
## 70 70 One_Story RL 81 15593 Pave None Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 8 Lvl AllPub Corner Gtl NWAmes PosN
## 9 Lvl AllPub Inside Gtl OldTown Artery
## 15 Lvl AllPub Corner Gtl NAmes Norm
## 22 Bnk AllPub Inside Gtl IDOTRR Norm
## 30 Lvl AllPub Inside Gtl BrkSide Feedr
## 31 Lvl AllPub Inside Gtl IDOTRR Feedr
## 49 Lvl AllPub Inside Gtl OldTown Norm
## 59 HLS AllPub CulDSac Gtl StoneBr Norm
## 62 Lvl AllPub Inside Gtl IDOTRR Norm
## 64 Bnk AllPub Inside Gtl OldTown RRAn
## 70 Lvl AllPub Corner Gtl ClearCr Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 4 Norm 1Fam 2Story 7 5 1915
## 8 Norm 1Fam 2Story 7 6 1973
## 9 Norm 1Fam 1.5Fin 7 5 1931
## 15 Norm 1Fam 1Story 6 5 1960
## 22 Norm 1Fam 1.5Unf 7 7 1930
## 30 RRNn 1Fam 1Story 4 6 1927
## 31 Norm 1Fam 2Story 4 4 1920
## 49 Norm 2fmCon 2Story 4 5 1920
## 59 Norm 1Fam 2Story 10 5 2006
## 62 Norm 1Fam 2.5Unf 5 7 1920
## 64 Feedr 1Fam 2Story 7 6 1921
## 70 Norm 1Fam 1.5Fin 7 4 1953
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 8 1973 Gable CompShg HdBoard HdBoard Stone
## 9 1950 Gable CompShg BrkFace Wd Shng None
## 15 1960 Hip CompShg MetalSd MetalSd BrkFace
## 22 1950 Gable CompShg Wd Sdng Wd Sdng None
## 30 1950 Gable CompShg MetalSd MetalSd None
## 31 1950 Gambrel CompShg BrkFace BrkFace None
## 49 2008 Gable CompShg MetalSd MetalSd None
## 59 2006 Hip CompShg VinylSd VinylSd BrkFace
## 62 1996 Gable CompShg MetalSd MetalSd None
## 64 1950 Gable CompShg Stucco Stucco None
## 70 1953 Gable CompShg BrkFace AsbShng None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 4 0 TA TA BrkTil TA Gd
## 8 240 TA TA CBlock Gd TA
## 9 0 TA TA BrkTil TA TA
## 15 212 TA TA CBlock TA TA
## 22 0 TA TA PConc TA TA
## 30 0 TA TA BrkTil TA TA
## 31 0 TA Fa BrkTil TA TA
## 49 0 TA TA BrkTil TA TA
## 59 1031 Ex TA PConc Ex TA
## 62 0 TA TA BrkTil TA Fa
## 64 0 TA TA BrkTil TA TA
## 70 0 Gd TA CBlock TA TA
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 4 No ALQ 216 Unf 0
## 8 Mn ALQ 859 BLQ 32
## 9 No Unf 0 Unf 0
## 15 No BLQ 733 Unf 0
## 22 No Unf 0 Unf 0
## 30 No Unf 0 Unf 0
## 31 No Unf 0 Unf 0
## 49 No Unf 0 Unf 0
## 59 Gd Unf 0 Unf 0
## 62 No Unf 0 Unf 0
## 64 No Unf 0 Unf 0
## 70 No BLQ 603 Unf 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 4 540 756 GasA Gd Y SBrkr
## 8 216 1107 GasA Ex Y SBrkr
## 9 952 952 GasA Gd Y FuseF
## 15 520 1253 GasA TA Y SBrkr
## 22 637 637 GasA Ex Y FuseF
## 30 520 520 GasA Fa N SBrkr
## 31 649 649 GasA TA N SBrkr
## 49 736 736 GasA Gd Y SBrkr
## 59 1410 1410 GasA Ex Y SBrkr
## 62 530 530 GasA TA N SBrkr
## 64 576 576 GasA Gd Y SBrkr
## 70 701 1304 GasW TA Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 4 961 756 0 1717 1 0
## 8 1107 983 0 2090 1 0
## 9 1022 752 0 1774 0 0
## 15 1253 0 0 1253 1 0
## 22 1108 0 0 1108 0 0
## 30 520 0 0 520 0 0
## 31 649 668 0 1317 0 0
## 49 736 716 0 1452 0 0
## 59 1426 1519 0 2945 0 0
## 62 581 530 0 1111 0 0
## 64 902 808 0 1710 0 0
## 70 1304 983 0 2287 0 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 4 1 0 3 1 Gd 7
## 8 2 1 3 1 TA 7
## 9 2 0 2 2 TA 8
## 15 1 1 2 1 TA 5
## 22 1 0 3 1 Gd 6
## 30 1 0 1 1 Fa 4
## 31 1 0 3 1 TA 6
## 49 2 0 2 3 TA 8
## 59 3 1 3 1 Gd 10
## 62 1 0 3 1 Fa 6
## 64 2 0 3 1 TA 9
## 70 2 0 3 1 TA 7
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 4 Typ 1 Gd Detchd 1998 Unf
## 8 Typ 2 TA Attchd 1973 RFn
## 9 Min1 2 TA Detchd 1931 Unf
## 15 Typ 1 Fa Attchd 1960 RFn
## 22 Typ 1 Gd Attchd 1930 Unf
## 30 Typ 0 NoF Detchd 1920 Unf
## 31 Typ 0 NoF Detchd 1920 Unf
## 49 Typ 0 NoF NoG NoG NoG
## 59 Typ 1 Gd BuiltIn 2006 Fin
## 62 Typ 0 NoF Detchd 1935 Unf
## 64 Typ 0 NoF Detchd 1990 Unf
## 70 Typ 1 TA Attchd 1953 Fin
## GarageCars GarageArea GarageQual GarageCond PavedDrive WoodDeckSF
## 4 3 642 TA TA Y 0
## 8 2 484 TA TA Y 235
## 9 2 468 Fa TA Y 90
## 15 1 352 TA TA Y 0
## 22 1 280 TA TA N 0
## 30 1 240 Fa TA Y 49
## 31 1 250 TA Fa N 0
## 49 0 0 NoG NoG N 0
## 59 3 641 TA TA Y 192
## 62 1 288 TA TA N 0
## 64 2 480 TA TA Y 12
## 70 2 667 TA TA Y 0
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC
## 4 35 272 0 0 0 NoP
## 8 204 228 0 0 0 NoP
## 9 0 205 0 0 0 NoP
## 15 213 176 0 0 0 NoP
## 22 0 205 0 0 0 NoP
## 30 0 87 0 0 0 NoP
## 31 54 172 0 0 0 NoP
## 49 0 102 0 0 0 NoP
## 59 0 37 0 0 0 NoP
## 62 0 144 0 0 0 NoP
## 64 11 64 0 0 0 NoP
## 70 21 114 0 0 0 NoP
## Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
## 4 NoF NoM 0 2 2006 WD Abnorml
## 8 NoF Shed 350 11 2009 WD Normal
## 9 NoF NoM 0 4 2008 WD Abnorml
## 15 GdWo NoM 0 5 2008 WD Normal
## 22 GdPrv NoM 0 6 2007 WD Normal
## 30 NoF NoM 0 5 2008 WD Normal
## 31 MnPrv NoM 0 7 2008 WD Normal
## 49 NoF NoM 0 6 2009 New Partial
## 59 NoF NoM 0 10 2006 New Partial
## 62 NoF NoM 0 3 2007 WD Normal
## 64 GdPrv NoM 0 4 2010 WD Normal
## 70 NoF NoM 0 7 2006 WD Normal
## SalePrice
## 4 140000
## 8 200000
## 9 129900
## 15 157000
## 22 139400
## 30 68500
## 31 40000
## 49 113000
## 59 438780
## 62 101000
## 64 140000
## 70 225000
## [ reached getOption("max.print") -- omitted 195 rows ]
# Character column: frequency table of the levels, a boxplot of SalePrice per
# level, and the column index is appended to `toremove`.
# NOTE(review): the rendered output for this segment (EnclosedPorch) comes
# from the numeric branch, so this block - including the `toremove`
# bookkeeping below - does not appear to run for it. Confirm whether the
# column was meant to be recorded in `toremove`; if so, the bookkeeping has to
# live in the numeric branch instead.
if (is.character(temp[, attrib])) {
  print(as.data.frame(table(temp[, attrib])))
  # Columns are referenced through the .data pronoun instead of temp[, ...] /
  # temp$..., so the aesthetics are resolved from the `data` argument.
  print(
    ggplot(data = temp, aes(x = .data[[names(temp)[attrib]]], y = SalePrice)) +
      geom_boxplot(aes(fill = factor(.data[[names(temp)[attrib]]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  global <- global + 1
  toremove[global] <- attrib
}
attrib = attrib + 1
colname[attrib]
## [1] "X3SsnPorch"
if (typeof(temp[,attrib]) == 'integer' ||(typeof(temp[,attrib]) == 'double')){
graph = boxplot(temp[,attrib])
print(length(graph$out))
print(graph$out)
outlierID = which(temp[,attrib]>graph$stats[5],)
temp[outlierID,]
outlier_X3Ss = subset(temp, temp[,attrib]>graph$stats[5]|temp[,attrib]<graph$stats[1])
gg = gg+1
outers[gg] = attrib
}
## [1] 24
## [1] 320 407 130 180 168 180 140 508 238 245 196 144 144 182 168 162 23
## [18] 168 216 96 216 153 290 304
# Categorical attribute: tabulate the level frequencies and draw SalePrice
# boxplots per level. Queuing the column in `toremove` is disabled here.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # global <- global + 1
  # toremove[global] <- attrib
}
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "ScreenPorch"
# Numeric attribute: boxplot it, report the points beyond the whiskers, keep
# the rows outside either whisker, and record the column index in `outers`.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
  outlier_screenPorch <- subset(
    temp,
    temp[[attrib]] < graph$stats[1] | temp[[attrib]] > graph$stats[5]
  )
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 116
## [1] 176 198 291 252 99 184 168 130 142 192 410 224 266 170 154 153 144
## [18] 142 128 259 160 198 271 234 184 374 192 185 182 90 144 224 396 170
## [35] 176 140 276 192 180 161 168 145 200 122 95 144 120 60 120 126 189
## [52] 260 147 385 287 200 156 100 180 216 210 197 204 192 225 192 152 175
## [69] 126 312 222 265 224 322 120 190 233 63 147 180 53 143 189 189 189
## [86] 192 160 160 126 100 273 180 90 288 263 224 147 120 80 163 90 288
## [103] 116 259 224 216 480 120 178 440 155 168 220 119 165 40
# Categorical attribute: tabulate the level frequencies and draw SalePrice
# boxplots per level. Queuing the column in `toremove` is disabled here.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # global <- global + 1
  # toremove[global] <- attrib
}
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "PoolArea"
# Numeric attribute: boxplot it, report the points beyond the whiskers, keep
# the rows outside either whisker, and record the column index in `outers`.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
  outlier_poolarea <- subset(
    temp,
    temp[[attrib]] < graph$stats[1] | temp[[attrib]] > graph$stats[5]
  )
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 7
## [1] 512 648 576 555 480 519 738
# Categorical attribute: tabulate the level frequencies and draw SalePrice
# boxplots per level. Queuing the column in `toremove` is disabled here.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # global <- global + 1
  # toremove[global] <- attrib
}
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "PoolQC"
# Numeric attribute: boxplot it and list the rows above the upper whisker.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
}
# Categorical attribute: tabulate the level frequencies, draw SalePrice
# boxplots per level, and queue the column index in `toremove`.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  global <- global + 1
  toremove[global] <- attrib
}
## Var1 Freq
## 1 Ex 2
## 2 Fa 2
## 3 Gd 3
## 4 NoP 1442
Too imbalanced: almost every observation falls into a single level (NoP), so this attribute carries very little variance.
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "Fence"
# Numeric attribute: boxplot it and list the rows above the upper whisker.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
}
# Categorical attribute: tabulate the level frequencies, draw SalePrice
# boxplots per level, and queue the column index in `toremove`.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  global <- global + 1
  toremove[global] <- attrib
}
## Var1 Freq
## 1 GdPrv 59
## 2 GdWo 54
## 3 MnPrv 156
## 4 MnWw 11
## 5 NoF 1169
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "MiscFeature"
# Numeric attribute: boxplot it and list the rows above the upper whisker.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
}
# Categorical attribute: tabulate the level frequencies, draw SalePrice
# boxplots per level, and queue the column index in `toremove`.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  global <- global + 1
  toremove[global] <- attrib
}
## Var1 Freq
## 1 Gar2 2
## 2 NoM 1395
## 3 Othr 2
## 4 Shed 49
## 5 TenC 1
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "MiscVal"
# Numeric attribute: boxplot it, report the points beyond the whiskers, keep
# the rows outside either whisker, and record the column index in `outers`.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
  outlier_miscVal <- subset(
    temp,
    temp[[attrib]] < graph$stats[1] | temp[[attrib]] > graph$stats[5]
  )
  gg <- gg + 1
  outers[gg] <- attrib
}
## [1] 52
## [1] 700 350 700 500 400 700 480 400 400 450 450
## [12] 500 450 700 400 15500 1200 800 480 400 2000 2000
## [23] 600 500 600 600 3500 500 400 450 500 1300 1200
## [34] 500 400 54 500 400 400 2000 620 400 560 500
## [45] 700 1400 400 8300 600 1150 2000 2500
# Categorical attribute: tabulate the level frequencies and draw SalePrice
# boxplots per level. Queuing the column in `toremove` is disabled here.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # global <- global + 1
  # toremove[global] <- attrib
}
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "MoSold"
# Numeric attribute: boxplot it and list the rows above the upper whisker.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
}
# Categorical attribute: tabulate the level frequencies and draw SalePrice
# boxplots per level. Queuing the column in `toremove` is disabled here.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # global <- global + 1
  # toremove[global] <- attrib
}
## Var1 Freq
## 1 1 58
## 2 10 89
## 3 11 78
## 4 12 58
## 5 2 52
## 6 3 104
## 7 4 140
## 8 5 201
## 9 6 253
## 10 7 233
## 11 8 121
## 12 9 62
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "YrSold"
# Numeric attribute: boxplot it and list the rows above the upper whisker.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
}
# Categorical attribute: tabulate the level frequencies and draw SalePrice
# boxplots per level. Queuing the column in `toremove` is disabled here.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # global <- global + 1
  # toremove[global] <- attrib
}
## Var1 Freq
## 1 2006 313
## 2 2007 327
## 3 2008 299
## 4 2009 336
## 5 2010 174
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "SaleType"
# Numeric attribute: boxplot it and list the rows above the upper whisker.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
}
# Categorical attribute: tabulate the level frequencies, draw SalePrice
# boxplots per level, and queue the column index in `toremove`.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  global <- global + 1
  toremove[global] <- attrib
}
## Var1 Freq
## 1 COD 43
## 2 Con 2
## 3 ConLD 9
## 4 ConLI 5
## 5 ConLw 5
## 6 CWD 4
## 7 New 119
## 8 Oth 3
## 9 WD 1259
To be dropped because of low variance: a single level (WD) accounts for the vast majority of the observations.
# Advance to the next attribute and show which column is under review.
attrib <- attrib + 1
colname[attrib]
## [1] "SaleCondition"
# Numeric attribute: boxplot it and list the rows above the upper whisker.
if (typeof(temp[[attrib]]) %in% c("integer", "double")) {
  graph <- boxplot(temp[[attrib]])
  print(length(graph$out))
  print(graph$out)
  outlierID <- which(temp[[attrib]] > graph$stats[5])
  temp[outlierID, ]
}
# Categorical attribute: tabulate the level frequencies and draw SalePrice
# boxplots per level. Queuing the column in `toremove` is disabled here.
if (is.character(temp[[attrib]])) {
  print(as.data.frame(table(temp[[attrib]])))
  print(
    ggplot(data = temp, aes(x = temp[[attrib]], y = temp$SalePrice)) +
      geom_boxplot(aes(fill = factor(temp[[attrib]]))) +
      scale_x_discrete(name = colname[attrib]) +
      labs(fill = colname[attrib])
  )
  # global <- global + 1
  # toremove[global] <- attrib
}
## Var1 Freq
## 1 Abnorml 101
## 2 AdjLand 4
## 3 Alloca 11
## 4 Family 20
## 5 Normal 1191
## 6 Partial 122
Time to make some decisions.
# Summary of the per-attribute review.
# `toremove` holds the column indices queued for removal; `colname` maps an
# index back to the attribute name.
print("The following attributes are under review to be removed:")
## [1] "The following attributes are under review to be removed:"
colname[toremove]
## [1] "MSZoning" "Street" "Alley" "LandContour" "Utilities"
## [6] "LotConfig" "LandSlope" "Condition1" "Condition2" "BldgType"
## [11] "RoofMatl" "BsmtCond" "Heating" "CentralAir" "Electrical"
## [16] "Functional" "GarageQual" "GarageCond" "PavedDrive" "PoolQC"
## [21] "Fence" "MiscFeature" "SaleType"
# `outers` holds the column indices of the numeric attributes whose outlier
# rows were recorded during the review.
print("The following attributes have outliers that need to be addressed:")
## [1] "The following attributes have outliers that need to be addressed:"
colname[outers]
## [1] "LotFrontage" "LotArea" "YearBuilt" "MasVnrArea"
## [5] "BsmtFinSF1" "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF"
## [9] "X1stFlrSF" "X2ndFlrSF" "LowQualFinSF" "GrLivArea"
## [13] "GarageCars" "GarageArea" "WoodDeckSF" "OpenPorchSF"
## [17] "X3SsnPorch" "ScreenPorch" "PoolArea" "MiscVal"
We will now check the variance of each variable. We want high variance.
# The rule I am employing: if one level holds over 65% of the observations, or two levels hold over 70%, the attribute is removed for low variance.
# Drop every column whose index was queued in `toremove` during the review.
temp[toremove] <- NULL
We shall look at the matrix of the numeric attributes and remove any that have higher than 0.75 correlation with the dependent variable
# Correlation screen: keep only the numeric columns, compute their pairwise
# correlations and plot the matrix.
# vapply() guarantees a logical mask with one entry per column.
nums <- vapply(temp, is.numeric, logical(1))
numONLY <- temp[, nums, drop = FALSE]
aa <- cor(numONLY)
ggcorrplot(aa)
# We are only interested in the SalePrice column of the matrix. It is selected
# by name: a positional index (26 at this stage) points at a different
# attribute as soon as a numeric column is added or removed.
aa[, "SalePrice"]
## Id LotFrontage LotArea OverallQual OverallCond
## -0.02477111 0.20850483 0.26464334 0.79002549 -0.07550025
## YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2
## 0.52333350 0.50800276 0.47733825 0.38418943 -0.01040337
## BsmtUnfSF TotalBsmtSF X1stFlrSF X2ndFlrSF LowQualFinSF
## 0.21536457 0.61316258 0.60680522 0.32289059 -0.02530248
## GrLivArea GarageCars GarageArea WoodDeckSF OpenPorchSF
## 0.71019194 0.63961511 0.62245235 0.32498348 0.31112376
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
## -0.12892149 0.04522141 0.11299246 0.09310460 -0.02098008
## SalePrice
## 1.00000000
There are some conclusions we can make.
# Condense groups of closely related attributes into single engineered ones.
# The bathroom counts must be numeric before they can be summed (this undoes
# an earlier conversion).
temp[c("FullBath", "BsmtFullBath", "HalfBath", "BsmtHalfBath")] <- lapply(
  temp[c("FullBath", "BsmtFullBath", "HalfBath", "BsmtHalfBath")],
  as.numeric
)
# One bathroom total; half baths count as 0.5.
temp$TotalBath <- with(
  temp,
  FullBath + 0.5 * HalfBath + BsmtFullBath + 0.5 * BsmtHalfBath
)
# One porch/deck total.
temp$TotalPorchSF <- with(
  temp,
  OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch + WoodDeckSF
)
# Basement + both floors (the house area) plus the garage. The intermediate
# house-area column is not kept because the total area supersedes it.
temp$TotalArea <- with(temp, TotalBsmtSF + X1stFlrSF + X2ndFlrSF + GarageArea)
# TotalBsmtSF = BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF and is highly correlated
# with X1stFlrSF, so it is dropped.
temp$TotalBsmtSF <- NULL
We will remove OverallQual and GrLivArea due to their high correlation with the dependent variable (above our 0.7 threshold). GarageCars and GarageArea also have a high correlation with it, and they are themselves highly correlated with each other (0.88), so I shall remove one of them because they provide pretty much the same information about the dependent variable. I pick GarageArea to remove because there are already a lot of area variables.
Also, let's remove the attributes that were used in the additions as well.
# Remove the highly correlated attributes together with the columns that were
# folded into TotalArea, TotalBath and TotalPorchSF, plus PoolArea.
temp <- temp[, !(names(temp) %in% c(
  "OverallQual", "GrLivArea", "GarageArea",
  "X1stFlrSF", "X2ndFlrSF",
  "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
  "OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "WoodDeckSF",
  "PoolArea"
)), drop = FALSE]
After our reduction, lets see our variables
# Capture and show the attribute names that survived the reduction.
colname_new <- names(temp)
print(colname_new)
## [1] "Id" "MSSubClass" "LotFrontage" "LotArea"
## [5] "LotShape" "Neighborhood" "HouseStyle" "OverallCond"
## [9] "YearBuilt" "YearRemodAdd" "RoofStyle" "Exterior1st"
## [13] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [17] "ExterCond" "Foundation" "BsmtQual" "BsmtExposure"
## [21] "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2" "BsmtFinSF2"
## [25] "BsmtUnfSF" "HeatingQC" "LowQualFinSF" "BedroomAbvGr"
## [29] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Fireplaces"
## [33] "FireplaceQu" "GarageType" "GarageYrBlt" "GarageFinish"
## [37] "GarageCars" "MiscVal" "MoSold" "YrSold"
## [41] "SaleCondition" "SalePrice" "TotalBath" "TotalPorchSF"
## [45] "TotalArea"
We removed 25 variables so far, and added 3
# Mainly interested in the attributes we added, but let's look at the whole
# correlation matrix of the remaining numeric columns.
nums <- vapply(temp, is.numeric, logical(1))
numONLY <- temp[, nums, drop = FALSE]
aa <- cor(numONLY)
ggcorrplot(aa)
# We are only interested in the SalePrice column of the matrix. It is selected
# by name: after the column removals above, position 15 is TotalBath, not
# SalePrice.
# NOTE: the vector echoed below was rendered with the old positional index and
# therefore shows the TotalBath correlations (TotalBath = 1).
aa[, "SalePrice"]
## Id LotFrontage LotArea OverallCond YearBuilt
## 0.004045642 0.086229005 0.203137137 -0.168197692 0.524335321
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## 0.444130722 0.319482646 0.479579877 0.053856240 -0.104937056
## LowQualFinSF GarageCars MiscVal SalePrice TotalBath
## -0.040678483 0.482803779 -0.025550618 0.631961576 1.000000000
## TotalPorchSF TotalArea
## 0.313955559 0.617533096
# Variance of each numeric column.
vapply(numONLY, var, numeric(1))
## Id LotFrontage LotArea OverallCond YearBuilt
## 1.779660e+05 1.202679e+03 9.998646e+07 1.237127e+00 9.119621e+02
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## 4.265806e+02 3.283680e+04 2.075615e+05 2.620502e+04 1.956699e+05
## LowQualFinSF GarageCars MiscVal SalePrice TotalBath
## 2.381903e+03 5.609260e-01 2.479934e+05 6.297114e+09 6.144431e-01
## TotalPorchSF TotalArea
## 2.461454e+04 9.187692e+05
As assumed, the areas are highly correlated with one another. Also, the sale price is largely correlated with the areas and the bathrooms. I know that these areas are highly correlated, but because we folded so many variables into them, I will not be removing them.
Finally, we will look at addressing the outliers. I was holding off for as long as I could because they may potentially be eliminated with the removal of some of the attributes, but we might be at a point where no more attributes are to be taken out.
Let's look for some patterns. A lot was already discovered from the correlation table, but let's re-illustrate some of it here. Truth be told, in our univariate work we looked at some interesting combinations with the sale price. I will re-highlight these and look at some other combinations.
# Let's look at the lot attributes.
# Scatter plot of LotFrontage against LotArea.
plot(temp$LotFrontage, temp$LotArea)
# Jittered LotFrontage values for each LotShape level.
ggplot(data = temp, aes(y=temp$LotFrontage, x=temp$LotShape)) + geom_jitter(aes((temp$LotShape) ))
Nothing meaningful
# Scatter plots of SalePrice against five attributes: bathrooms, total area,
# garage capacity, year built and year remodelled.
plot(temp$TotalBath,temp$SalePrice)
plot(temp$TotalArea,temp$SalePrice)
plot(temp$GarageCars, temp$SalePrice)
plot(temp$YearBuilt, temp$SalePrice)
plot(temp$YearRemodAdd, temp$SalePrice)
# These attributes had the highest correlation with the sale price, which is evident in these plots.
# The plots also suggest that larger homes have more bathrooms and, in turn, higher prices.
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$TotalBath)
# Larger homes would also have better quality fireplaces.
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$FireplaceQu)
# And better exterior quality.
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$ExterQual)
# And better basement quality.
qplot(temp$TotalArea,temp$SalePrice, data = temp, colour= temp$BsmtQual)
# Quality in general makes for a better house price.
qplot(temp$ExterQual,temp$SalePrice, data = temp, colour= temp$BsmtQual)
# Very important comparisons, used to make decisions. Great for finding patterns.
# Each plot counts the observations at every (attribute value, SalePrice) pair.
ggplot(data = temp, aes(x=temp$ExterQual, y=temp$SalePrice)) + geom_count(aes( factor(temp$ExterQual) ))
ggplot(data = temp, aes(x=temp$KitchenQual, y=temp$SalePrice)) + geom_count(aes(factor(temp$KitchenQual) ))
ggplot(data = temp, aes(x=temp$BsmtQual, y=temp$SalePrice)) + geom_count(aes(factor(temp$BsmtQual) ))
ggplot(data = temp, aes(x=temp$FireplaceQu, y=temp$SalePrice)) + geom_count(aes(factor(temp$FireplaceQu) ))
#ggplot(data = temp, aes(x=temp$Fireplace, y=temp$SalePrice)) + geom_count(aes((temp$Fireplace) ))
qplot(temp$Fireplaces,temp$SalePrice, data = temp)
ggplot(data = temp, aes(x=temp$MiscVal, y=temp$SalePrice)) + geom_count(aes((temp$MiscVal) ))
ggplot(data = temp, aes(x=temp$BsmtFinSF1, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinSF1) ))
ggplot(data = temp, aes(x=temp$BsmtFinSF2, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinSF2) ))
ggplot(data = temp, aes(x=temp$BsmtFinType2, y=temp$SalePrice)) + geom_count(aes((temp$BsmtFinType2) )) # This slipped through the cracks; it should have been removed.
ggplot(data = temp, aes(x=temp$MasVnrArea, y=temp$SalePrice)) + geom_count(aes((temp$MasVnrArea) ))
ggplot(data = temp, aes(x=temp$MasVnrType, y=temp$SalePrice)) + geom_count(aes((temp$MasVnrType) ))
ggplot(data = temp, aes(x=temp$BsmtUnfSF, y=temp$SalePrice)) + geom_count(aes((temp$BsmtUnfSF) ))
# Means and low-value counts used to decide which attributes to keep.
mean(temp$BsmtUnfSF)
## [1] 567.4651
nrow(temp[temp$BsmtUnfSF<200,])# I decided to keep this attribute
## [1] 335
ggplot(data = temp, aes(x=temp$LowQualFinSF, y=temp$SalePrice)) + geom_count(aes((temp$LowQualFinSF) ))
ggplot(data = temp, aes(x=temp$GarageCars, y=temp$SalePrice)) + geom_count(aes((temp$GarageCars) ))
ggplot(data = temp, aes(x=temp$LotFrontage, y=temp$SalePrice)) + geom_count(aes((temp$LotFrontage) ))
nrow(temp[temp$LotFrontage<10,])# I decided to keep this attribute
## [1] 257
# NOTE(review): `temp$tot` matches no column of temp, and the layer-level
# aes() supplies LotFrontage, so this appears to repeat the LotFrontage plot
# above. Confirm which attribute was intended.
ggplot(data = temp, aes(x=temp$tot, y=temp$SalePrice)) + geom_count(aes((temp$LotFrontage) ))
# NOTE(review): PoolArea was already removed from temp earlier, which is why
# this call warns and returns NA.
mean(temp$PoolArea)
## Warning in mean.default(temp$PoolArea): argument is not numeric or logical:
## returning NA
## [1] NA
mean(temp$MiscVal)
## [1] 43.81919
# Fireplaces is converted with as.numeric() before averaging.
mean(as.numeric(temp$Fireplaces))
## [1] 0.6128364
# After observing the plots and looking into the variances and means, these
# attributes need to be removed. With no normalization applied yet, the raw
# means alone show how skewed a picture they give.
# The column is named `Fireplaces`. Assigning NULL to the non-existent
# `Fireplace` was a silent no-op that left the attribute in the data, which
# shifts every later positional column selection by one.
temp$Fireplaces <- NULL
# PoolArea was already dropped together with the merged area attributes, so no
# removal is needed for it here.
temp$MiscVal <- NULL
temp$BsmtFinSF2 <- NULL
temp$MasVnrArea <- NULL
temp$LowQualFinSF <- NULL
I saved this for as late as possible. I wanted to see if most of the attributes will be eliminated before we remove observations due to outliers.
# Remove the outlier observations, but only for attributes that are still part
# of the data. The Ids are read from the first column of each stored outlier
# frame and matched against temp$Id.

# BsmtFinSF1
flattened_outlier <- unlist(outlier_bsmtFinSF1[1], use.names = FALSE)
temp <- temp[!(temp$Id %in% flattened_outlier), , drop = FALSE]
# BsmtUnfSF
flattened_outlier <- unlist(outlier_bsmtUnfSF[1], use.names = FALSE)
temp <- temp[!(temp$Id %in% flattened_outlier), , drop = FALSE]
# GarageCars
flattened_outlier <- unlist(outlier_garagecars[1], use.names = FALSE)
temp <- temp[!(temp$Id %in% flattened_outlier), , drop = FALSE]
# LotArea
flattened_outlier <- unlist(outlier_lotArea[1], use.names = FALSE)
temp <- temp[!(temp$Id %in% flattened_outlier), , drop = FALSE]
# LotFrontage
flattened_outlier <- unlist(outlier_lotFrontage[1], use.names = FALSE)
temp <- temp[!(temp$Id %in% flattened_outlier), , drop = FALSE]

# The attributes below were removed from the data, so their stored outlier
# frames are no longer needed and are reset to 0.
outlier_bsmtFinSF2 <- 0
outlier_masVnrArea <- 0
outlier_garageArea <- 0
outlier_grLivArea <- 0
outlier_lowQualFinSF <- 0
outlier_miscVal <- 0
outlier_openPorch <- 0
outlier_poolarea <- 0
outlier_screenPorch <- 0
outlier_totalBsmtSF <- 0
outlier_woodDeck <- 0
outlier_x1stFlrSF <- 0
outlier_x2stFlrSF <- 0
outlier_X3Ss <- 0
print("it is a good idea we waited to remove the outliers, otherwise we would have lost a lot of observations only to remove the attributes later")
## [1] "it is a good idea we waited to remove the outliers, otherwise we would have lost a lot of observations only to remove the attributes later"
# The outliers of this attribute are deliberately kept:
# outlier_yearbuilt
# With the initial analysis done, copy the data under its working name and
# move SalePrice to the last column.
modified_data <- temp[c(setdiff(names(temp), "SalePrice"), "SalePrice")]
# Our normalizing technique: min-max scaling to the range [0, 1].
#
# Arguments:
#   x  A vector (typically one data-frame column).
# Returns:
#   For numeric input, a vector of the same length rescaled so that the
#   smallest non-missing value maps to 0 and the largest to 1. Missing values
#   stay missing instead of turning the whole column into NA. A constant
#   column maps to 0 instead of producing NaN from a division by zero.
#   Non-numeric input, empty input and all-NA input are returned unchanged.
normalize <- function(x) {
  if (!is.numeric(x)) {
    return(x)
  }
  # Nothing to scale; this also avoids the min()/max() warnings on empty input.
  if (length(x) == 0L || all(is.na(x))) {
    return(x)
  }
  lo <- min(x, na.rm = TRUE)
  span <- max(x, na.rm = TRUE) - lo
  # isTRUE() keeps a non-finite span (e.g. NaN) from breaking the condition.
  if (isTRUE(span == 0)) {
    return(ifelse(is.na(x), NA_real_, 0))
  }
  (x - lo) / span
}
# Normalize every predictor. The predictors are selected by name (everything
# except the Id key and the SalePrice label) rather than by the positional
# range 2:39. That range only covers all predictors for one exact column count
# and silently leaves out trailing columns otherwise.
data_norm <- as.data.frame(lapply(
  modified_data[setdiff(names(modified_data), c("Id", "SalePrice"))],
  normalize
))
# Put the (unscaled) label in front; it ends up in column 1.
data_norm <- cbind(modified_data$SalePrice, data_norm)
# Cluster the numeric attributes only.
# The numeric mask is computed on the same data frame it is applied to.
# modified_data has SalePrice moved to the end, so a mask built from it lines
# up with temp's columns only by coincidence.
numsonly <- vapply(temp, is.numeric, logical(1))
numarray <- temp[, numsonly, drop = FALSE]
# Id is a row key, not a feature.
numarray <- subset(numarray, select = -c(Id))
fit <- kmeans(numarray, 4)
plotcluster(numarray, fit$cluster)
#str(fit)
fit <- kmodes(numarray, 4)
## Warning in kmodes(numarray, 4): data has numeric coloumns with more than 30
## different levels!
plotcluster(numarray, fit$cluster)
# It is not meaningful to have clustering for categorical variables. I did only numerical values.
In fact, these clusters are not meaningful for the numerical only attributes, either! It would have been a better tool to be used in EDA of 2-5 variables perhaps, but I will do without it.
# Fix the random seed so the split below is reproducible.
set.seed(1)
# 80% of the rows go to training, the remaining 20% to testing.
index <- sample.int(nrow(data_norm), 0.80 * nrow(data_norm))
data_test <- data_norm[-index, ]
data_train <- data_norm[index, ]
# The label (SalePrice) sits in column 1.
trainlabel <- data_train[[1]]
testlabel <- data_test[[1]]
#Applying KNN
##test_pred <- knn(train = data_train[,2:39], test = data_test[,2:39],cl = data_train[,1], k=9)
#Creating accuracy matrix
##CrossTable(x=testlabel, y=test_pred, prop.chisq=FALSE)
I learned that KNN might not be a good fit for a data frame that contains categorical attributes. We shall need to look at regression instead.
#tree = rpart(data_train$'modified_data$SalePrice'~MSSubClass+LotFrontage+LotArea+LotShape+Neighborhood+HouseStyle+OverallCond+YearBuilt+YearRemodAdd+RoofStyle+Exterior1st+Exterior2nd+MasVnrType+ExterQual+ExterCond+Foundation+BsmtQual+BsmtExposure+BsmtFinType1+BsmtFinSF1+BsmtFinType2+BsmtUnfSF+HeatingQC+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+FireplaceQu+GarageType+GarageYrBlt+GarageFinish+GarageCars+MoSold+YrSold+SaleCondition+TotalBath+TotalPorchSF+TotalArea, data_train, method = "class")